Portable SIMD and platform-specific intrinsics
SIMD (Single Instruction, Multiple Data) is a form of parallel computation where a single instruction operates on multiple data points simultaneously. Modern CPUs have dedicated SIMD instruction sets (SSE, AVX, NEON) that can process 4, 8, or even 16 values at once, providing massive performance improvements for data-parallel operations.
SIMD is essential for data-parallel workloads: image and signal processing, linear algebra, string scanning, compression, and statistics — anywhere the same operation is applied across a large array.
Scalar code processes one element at a time:
// Scalar: 1 operation per instruction
for i in 0..data.len() {
result[i] = data[i] * 2.0;
}
// For 1000 elements: 1000 multiplications
With SIMD, you can process multiple elements per instruction:
// SIMD: 8 operations per instruction (AVX with f32)
// For 1000 elements: ~125 multiplications (8x speedup)
Challenges: handling remainder elements when the data length is not a multiple of the vector width, keeping data aligned and contiguous, and portability across instruction sets (SSE, AVX, NEON) — plus the fact that portable SIMD in Rust currently requires nightly, while platform intrinsics require `unsafe`.
// The compiler can often auto-vectorize simple loops
// Scalar implementation
/// Baseline element-wise addition: one add per loop iteration.
/// Panics if `b` or `result` is shorter than `a`.
fn add_scalar(a: &[f32], b: &[f32], result: &mut [f32]) {
    let n = a.len();
    for idx in 0..n {
        result[idx] = a[idx] + b[idx];
    }
}
// Iterator-based (easier to auto-vectorize)
/// Element-wise addition written over zipped iterators.
/// No indexing occurs, so the compiler elides bounds checks and the loop
/// auto-vectorizes readily; iteration stops at the shortest slice.
fn add_iterator(a: &[f32], b: &[f32], result: &mut [f32]) {
    for (out, (x, y)) in result.iter_mut().zip(a.iter().zip(b.iter())) {
        *out = x + y;
    }
}
// Using chunks for explicit vectorization hint
/// Adds `a` and `b` element-wise into `result`, walking fixed-size chunks
/// so the compiler can vectorize the inner loop.
///
/// Uses `chunks_exact`, which guarantees every chunk is exactly
/// `CHUNK_SIZE` long — this lets LLVM hoist bounds checks out of the inner
/// loop (the manually-indexed version paid a check per access). The tail
/// shorter than one chunk is handled via the iterators' remainders.
/// Assumes all three slices have equal length.
fn add_chunks(a: &[f32], b: &[f32], result: &mut [f32]) {
    const CHUNK_SIZE: usize = 8; // Matches one AVX register of f32s

    let mut r_chunks = result.chunks_exact_mut(CHUNK_SIZE);
    let mut a_chunks = a.chunks_exact(CHUNK_SIZE);
    let mut b_chunks = b.chunks_exact(CHUNK_SIZE);

    // Full chunks (likely vectorized)
    for ((rc, ac), bc) in (&mut r_chunks).zip(&mut a_chunks).zip(&mut b_chunks) {
        for j in 0..CHUNK_SIZE {
            rc[j] = ac[j] + bc[j];
        }
    }

    // Handle remainder (fewer than CHUNK_SIZE elements)
    for ((r, &x), &y) in r_chunks
        .into_remainder()
        .iter_mut()
        .zip(a_chunks.remainder())
        .zip(b_chunks.remainder())
    {
        *r = x + y;
    }
}
/// Times the three addition kernels on identical inputs so their
/// auto-vectorization behavior can be compared. Wall-clock only — not a
/// rigorous benchmark (no warm-up, no black_box).
fn example_auto_vectorization() {
    const N: usize = 1000;
    const ITERS: usize = 10000;

    let a = vec![1.0f32; N];
    let b = vec![2.0f32; N];
    let mut result = vec![0.0f32; N];

    let t = std::time::Instant::now();
    for _ in 0..ITERS {
        add_scalar(&a, &b, &mut result);
    }
    println!("Scalar: {:?}", t.elapsed());

    let t = std::time::Instant::now();
    for _ in 0..ITERS {
        add_iterator(&a, &b, &mut result);
    }
    println!("Iterator: {:?}", t.elapsed());

    let t = std::time::Instant::now();
    for _ in 0..ITERS {
        add_chunks(&a, &b, &mut result);
    }
    println!("Chunks: {:?}", t.elapsed());
}
#![feature(portable_simd)]
use std::simd::{f32x8, Simd, SimdFloat};
/// Element-wise `result[i] = a[i] + b[i]` using portable SIMD
/// (`std::simd`, nightly-only), 8 lanes at a time with a scalar tail.
/// Panics if `b` or `result` is shorter than `a`.
fn add_simd(a: &[f32], b: &[f32], result: &mut [f32]) {
    const LANES: usize = 8; // f32x8 has 8 lanes
    let len = a.len();
    // Largest multiple of LANES that fits in len
    let simd_len = len - (len % LANES);
    // SIMD processing
    for i in (0..simd_len).step_by(LANES) {
        // from_slice reads exactly LANES elements starting at i
        let va = Simd::<f32, LANES>::from_slice(&a[i..]);
        let vb = Simd::<f32, LANES>::from_slice(&b[i..]);
        let vr = va + vb;
        vr.copy_to_slice(&mut result[i..]);
    }
    // Scalar remainder
    for i in simd_len..len {
        result[i] = a[i] + b[i];
    }
}
/// Computes `result[i] = a[i] * b[i] + c[i]` with 8-lane portable SIMD,
/// using fused multiply-add (`mul_add`) for the vector part.
///
/// NOTE(review): the scalar tail uses `a*b + c` (two roundings) while the
/// SIMD path fuses (one rounding), so tail elements can differ from the
/// SIMD elements in the last bit — confirm whether that matters to callers.
fn multiply_add_simd(a: &[f32], b: &[f32], c: &[f32], result: &mut [f32]) {
    const LANES: usize = 8;
    let len = a.len();
    let simd_len = len - (len % LANES);
    for i in (0..simd_len).step_by(LANES) {
        let va = f32x8::from_slice(&a[i..]);
        let vb = f32x8::from_slice(&b[i..]);
        let vc = f32x8::from_slice(&c[i..]);
        // result = a * b + c (fused multiply-add)
        let vr = va.mul_add(vb, vc);
        vr.copy_to_slice(&mut result[i..]);
    }
    // Scalar remainder
    for i in simd_len..len {
        result[i] = a[i] * b[i] + c[i];
    }
}
/// Dot product of `a` and `b` using 8-lane portable SIMD.
///
/// Accumulates lane-wise partial products, horizontally reduces them, then
/// adds a scalar tail for the last `len % 8` elements. Lane-wise
/// accumulation reorders float additions, so results can differ from a
/// purely scalar loop in the last bits.
fn dot_product_simd(a: &[f32], b: &[f32]) -> f32 {
    const LANES: usize = 8;
    let len = a.len();
    let simd_len = len - (len % LANES);
    // One partial sum per lane
    let mut sum = f32x8::splat(0.0);
    for i in (0..simd_len).step_by(LANES) {
        let va = f32x8::from_slice(&a[i..]);
        let vb = f32x8::from_slice(&b[i..]);
        sum += va * vb;
    }
    // Horizontal sum
    let result = sum.reduce_sum();
    // Add remainder
    let scalar_sum: f32 = (simd_len..len).map(|i| a[i] * b[i]).sum();
    result + scalar_sum
}
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;
#[cfg(target_arch = "x86_64")]
/// Element-wise `result[i] = a[i] + b[i]` using 256-bit AVX vectors,
/// with a scalar fallback when AVX is unavailable at runtime.
///
/// # Safety
/// `b` and `result` must be at least `a.len()` elements long. Loads and
/// stores are unaligned (`loadu`/`storeu`), so no alignment is required.
unsafe fn add_avx(a: &[f32], b: &[f32], result: &mut [f32]) {
    if is_x86_feature_detected!("avx") {
        add_avx_body(a, b, result);
    } else {
        // Scalar fallback, inlined so this function is self-contained
        for i in 0..a.len() {
            result[i] = a[i] + b[i];
        }
    }
}

/// AVX body. `#[target_feature]` is required so the compiler actually
/// emits 256-bit instructions for the intrinsics below; without it the
/// function is compiled for the baseline target.
///
/// # Safety
/// Must only be called after confirming AVX support at runtime.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx")]
unsafe fn add_avx_body(a: &[f32], b: &[f32], result: &mut [f32]) {
    const LANES: usize = 8; // AVX register holds 8 f32s
    let len = a.len();
    let simd_len = len - (len % LANES);
    for i in (0..simd_len).step_by(LANES) {
        // Load 8 f32 values (unaligned)
        let va = _mm256_loadu_ps(a.as_ptr().add(i));
        let vb = _mm256_loadu_ps(b.as_ptr().add(i));
        // Add vectors
        let vr = _mm256_add_ps(va, vb);
        // Store result
        _mm256_storeu_ps(result.as_mut_ptr().add(i), vr);
    }
    // Scalar remainder
    for i in simd_len..len {
        result[i] = a[i] + b[i];
    }
}
#[cfg(target_arch = "x86_64")]
/// Element-wise `result[i] = a[i] * b[i]` using AVX.
///
/// BUG FIX: the previous version silently returned when AVX was missing,
/// leaving `result` unwritten; it now falls back to a scalar multiply.
///
/// # Safety
/// `b` and `result` must be at least `a.len()` elements long.
unsafe fn multiply_avx(a: &[f32], b: &[f32], result: &mut [f32]) {
    if is_x86_feature_detected!("avx") {
        multiply_avx_body(a, b, result);
    } else {
        // Scalar fallback so `result` is always fully written
        for i in 0..a.len() {
            result[i] = a[i] * b[i];
        }
    }
}

/// AVX body; `#[target_feature]` makes the compiler emit real 256-bit
/// instructions for the intrinsics.
///
/// # Safety
/// Must only be called after confirming AVX support at runtime.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx")]
unsafe fn multiply_avx_body(a: &[f32], b: &[f32], result: &mut [f32]) {
    const LANES: usize = 8;
    let len = a.len();
    let simd_len = len - (len % LANES);
    for i in (0..simd_len).step_by(LANES) {
        let va = _mm256_loadu_ps(a.as_ptr().add(i));
        let vb = _mm256_loadu_ps(b.as_ptr().add(i));
        let vr = _mm256_mul_ps(va, vb);
        _mm256_storeu_ps(result.as_mut_ptr().add(i), vr);
    }
    // Scalar remainder
    for i in simd_len..len {
        result[i] = a[i] * b[i];
    }
}
#[cfg(target_arch = "x86_64")]
/// Computes `result[i] = a[i] * b[i] + c[i]` with AVX FMA instructions.
///
/// BUG FIX: the previous version silently returned when FMA was missing,
/// leaving `result` unwritten; it now falls back to scalar `f32::mul_add`,
/// which matches the single-rounding semantics of `_mm256_fmadd_ps`.
///
/// # Safety
/// `b`, `c`, and `result` must be at least `a.len()` elements long.
unsafe fn fused_multiply_add_avx(a: &[f32], b: &[f32], c: &[f32], result: &mut [f32]) {
    if is_x86_feature_detected!("fma") {
        fused_multiply_add_body(a, b, c, result);
    } else {
        // Scalar fallback; mul_add keeps fused (single-rounding) semantics
        for i in 0..a.len() {
            result[i] = a[i].mul_add(b[i], c[i]);
        }
    }
}

/// FMA body; `#[target_feature]` makes the compiler emit real FMA
/// instructions for `_mm256_fmadd_ps`.
///
/// # Safety
/// Must only be called after confirming FMA support at runtime.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "fma")]
unsafe fn fused_multiply_add_body(a: &[f32], b: &[f32], c: &[f32], result: &mut [f32]) {
    const LANES: usize = 8;
    let len = a.len();
    let simd_len = len - (len % LANES);
    for i in (0..simd_len).step_by(LANES) {
        let va = _mm256_loadu_ps(a.as_ptr().add(i));
        let vb = _mm256_loadu_ps(b.as_ptr().add(i));
        let vc = _mm256_loadu_ps(c.as_ptr().add(i));
        // result = a * b + c, fused into one instruction
        let vr = _mm256_fmadd_ps(va, vb, vc);
        _mm256_storeu_ps(result.as_mut_ptr().add(i), vr);
    }
    // Scalar remainder (mul_add matches the fused rounding)
    for i in simd_len..len {
        result[i] = a[i].mul_add(b[i], c[i]);
    }
}
/// Scalar reference implementation used as the non-SIMD fallback:
/// one addition per iteration. Panics if `b` or `result` is shorter
/// than `a`.
fn add_scalar(a: &[f32], b: &[f32], result: &mut [f32]) {
    let n = a.len();
    let mut idx = 0;
    while idx < n {
        result[idx] = a[idx] + b[idx];
        idx += 1;
    }
}
#![feature(portable_simd)]
use std::simd::{u8x16, Simd};
/// Converts packed RGB (3 bytes per pixel) to 8-bit grayscale using the
/// integer form of the BT.601 luma weights:
/// gray = (299*R + 587*G + 114*B) / 1000.
/// Panics unless `rgb` holds exactly 3 bytes per output pixel.
fn rgb_to_gray_scalar(rgb: &[u8], gray: &mut [u8]) {
    assert_eq!(rgb.len(), gray.len() * 3);
    for (px, out) in rgb.chunks_exact(3).zip(gray.iter_mut()) {
        let (r, g, b) = (px[0] as u32, px[1] as u32, px[2] as u32);
        // Weighted average: 0.299*R + 0.587*G + 0.114*B
        *out = ((r * 299 + g * 587 + b * 114) / 1000) as u8;
    }
}
fn rgb_to_gray_simd(rgb: &[u8], gray: &mut [u8]) {
assert_eq!(rgb.len(), gray.len() * 3);
const LANES: usize = 16;
let len = gray.len();
let simd_len = len - (len % LANES);
for i in (0..simd_len).step_by(LANES) {
// Load RGB values
let r = u8x16::from_slice(&rgb[i * 3..]);
let g = u8x16::from_slice(&rgb[i * 3 + 1..]);
let b = u8x16::from_slice(&rgb[i * 3 + 2..]);
// Convert to u16 for arithmetic
// Simplified: just average
let gray_val = (r + g + b) / Simd::splat(3);
gray_val.copy_to_slice(&mut gray[i..]);
}
// Scalar remainder
for i in simd_len..len {
let r = rgb[i * 3] as u32;
let g = rgb[i * 3 + 1] as u32;
let b = rgb[i * 3 + 2] as u32;
gray[i] = ((r * 299 + g * 587 + b * 114) / 1000) as u8;
}
}
/// Box blur (averaging filter): each interior pixel becomes the mean of
/// the (2*radius+1)^2 window centered on it. Pixels within `radius` of an
/// edge are left untouched in `dst`.
///
/// Improvements: the window size is constant, so `count` is computed once
/// instead of being re-counted per pixel, and images smaller than the
/// kernel now return early instead of underflowing `height - radius`.
fn box_blur_scalar(src: &[u8], dst: &mut [u8], width: usize, height: usize, radius: usize) {
    // Guard: no interior pixels exist (also prevents usize underflow below)
    if width <= radius * 2 || height <= radius * 2 {
        return;
    }
    let side = radius * 2 + 1;
    // Window size is invariant — hoisted out of the per-pixel loops
    let count = (side * side) as u32;
    for y in radius..(height - radius) {
        for x in radius..(width - radius) {
            let mut sum = 0u32;
            for dy in 0..side {
                for dx in 0..side {
                    // y + dy is computed before subtracting radius and
                    // y >= radius, so this cannot underflow
                    let py = y + dy - radius;
                    let px = x + dx - radius;
                    sum += src[py * width + px] as u32;
                }
            }
            dst[y * width + x] = (sum / count) as u8;
        }
    }
}
fn brightness_adjust_simd(pixels: &mut [u8], adjustment: i16) {
const LANES: usize = 16;
let len = pixels.len();
let simd_len = len - (len % LANES);
let adj = u8x16::splat(adjustment.clamp(-255, 255) as u8);
for i in (0..simd_len).step_by(LANES) {
let pixel = u8x16::from_slice(&pixels[i..]);
let result = pixel.saturating_add(adj);
result.copy_to_slice(&mut pixels[i..]);
}
// Scalar remainder
for i in simd_len..len {
pixels[i] = (pixels[i] as i16 + adjustment).clamp(0, 255) as u8;
}
}
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;
/// Counts how many bytes in `text` equal `ch`, one byte at a time.
fn count_char_scalar(text: &[u8], ch: u8) -> usize {
    let mut total = 0;
    for &byte in text {
        if byte == ch {
            total += 1;
        }
    }
    total
}
#[cfg(target_arch = "x86_64")]
/// Counts occurrences of `ch` in `text`, 32 bytes per iteration with AVX2,
/// falling back to a scalar scan when AVX2 is unavailable.
///
/// # Safety
/// All raw-pointer loads stay within `text`; the AVX2 path is only taken
/// after runtime feature detection.
unsafe fn count_char_simd(text: &[u8], ch: u8) -> usize {
    if !is_x86_feature_detected!("avx2") {
        // Scalar fallback, inlined so this function is self-contained
        return text.iter().filter(|&&c| c == ch).count();
    }
    count_char_avx2(text, ch)
}

/// AVX2 body; `#[target_feature]` is required so the intrinsics compile
/// to real 256-bit instructions.
///
/// # Safety
/// Must only be called after confirming AVX2 support at runtime.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn count_char_avx2(text: &[u8], ch: u8) -> usize {
    const LANES: usize = 32; // one 256-bit register of bytes
    let len = text.len();
    let simd_len = len - (len % LANES);
    let target = _mm256_set1_epi8(ch as i8);
    let mut count = 0;
    for i in (0..simd_len).step_by(LANES) {
        // Unaligned 32-byte load
        let data = _mm256_loadu_si256(text.as_ptr().add(i) as *const __m256i);
        // Lane-wise equality: matching lanes become 0xFF
        let cmp = _mm256_cmpeq_epi8(data, target);
        // One bit per lane; popcount = matches in this chunk
        let mask = _mm256_movemask_epi8(cmp);
        count += mask.count_ones() as usize;
    }
    // Scalar tail
    count + text[simd_len..].iter().filter(|&&c| c == ch).count()
}
/// Find first occurrence of character
#[cfg(target_arch = "x86_64")]
/// Returns the index of the first byte equal to `ch`, scanning 32 bytes
/// per iteration with AVX2; falls back to `Iterator::position` without
/// AVX2. Returns `None` when `ch` does not occur.
unsafe fn find_char_simd(text: &[u8], ch: u8) -> Option<usize> {
    if !is_x86_feature_detected!("avx2") {
        return text.iter().position(|&c| c == ch);
    }
    const LANES: usize = 32;
    let len = text.len();
    let simd_len = len - (len % LANES);
    let target = _mm256_set1_epi8(ch as i8);
    for i in (0..simd_len).step_by(LANES) {
        let data = _mm256_loadu_si256(text.as_ptr().add(i) as *const __m256i);
        // Lane-wise equality; movemask packs one bit per matching byte
        let cmp = _mm256_cmpeq_epi8(data, target);
        let mask = _mm256_movemask_epi8(cmp);
        if mask != 0 {
            // Found match, find first bit set (lowest bit = lowest index)
            return Some(i + mask.trailing_zeros() as usize);
        }
    }
    // Check remainder (offset positions back into the full slice)
    text[simd_len..].iter().position(|&c| c == ch).map(|pos| simd_len + pos)
}
/// Check if string contains only ASCII
#[cfg(target_arch = "x86_64")]
/// Returns true if every byte in `text` is ASCII (< 0x80), checking 32
/// bytes per iteration with AVX2; falls back to `slice::is_ascii`.
unsafe fn is_ascii_simd(text: &[u8]) -> bool {
    if !is_x86_feature_detected!("avx2") {
        return text.is_ascii();
    }
    const LANES: usize = 32;
    let len = text.len();
    let simd_len = len - (len % LANES);
    for i in (0..simd_len).step_by(LANES) {
        let data = _mm256_loadu_si256(text.as_ptr().add(i) as *const __m256i);
        // movemask collects the high bit of each byte; any set bit means
        // some byte is >= 0x80, i.e. non-ASCII
        let mask = _mm256_movemask_epi8(data);
        if mask != 0 {
            return false;
        }
    }
    // Check remainder
    text[simd_len..].is_ascii()
}
#![feature(portable_simd)]
use std::simd::{f32x8, Simd};
/// Dense f32 matrix stored row-major in a flat `Vec`.
struct Matrix {
    data: Vec<f32>, // element (r, c) lives at index r * cols + c
    rows: usize,
    cols: usize,
}
impl Matrix {
    /// Creates a `rows` x `cols` matrix with every element zeroed.
    fn new(rows: usize, cols: usize) -> Self {
        Matrix {
            data: vec![0.0; rows * cols],
            rows,
            cols,
        }
    }

    /// Returns element (row, col); panics if the flat index is out of bounds.
    fn get(&self, row: usize, col: usize) -> f32 {
        self.data[row * self.cols + col]
    }

    /// Sets element (row, col) to `value`; panics if out of bounds.
    fn set(&mut self, row: usize, col: usize, value: f32) {
        self.data[row * self.cols + col] = value;
    }

    /// Matrix-vector multiplication (SIMD optimized)
    ///
    /// Each output element is the dot product of one matrix row with
    /// `vec`, accumulated 8 lanes at a time with a scalar tail. Lane-wise
    /// accumulation reorders float additions, so results may differ from
    /// a scalar loop in the last bits. Panics if `vec.len() != cols`.
    fn multiply_vector_simd(&self, vec: &[f32]) -> Vec<f32> {
        assert_eq!(self.cols, vec.len());
        let mut result = vec![0.0; self.rows];
        const LANES: usize = 8;
        for row in 0..self.rows {
            let row_start = row * self.cols;
            // Largest multiple of LANES that fits in a row
            let simd_len = self.cols - (self.cols % LANES);
            let mut sum = f32x8::splat(0.0);
            // SIMD processing
            for col in (0..simd_len).step_by(LANES) {
                let mat_vals = f32x8::from_slice(&self.data[row_start + col..]);
                let vec_vals = f32x8::from_slice(&vec[col..]);
                sum += mat_vals * vec_vals;
            }
            result[row] = sum.reduce_sum();
            // Scalar remainder
            for col in simd_len..self.cols {
                result[row] += self.data[row_start + col] * vec[col];
            }
        }
        result
    }

    /// Element-wise addition (SIMD)
    ///
    /// Adds the two backing buffers directly (layout is identical for
    /// matrices of equal shape). Panics on shape mismatch.
    fn add_simd(&self, other: &Matrix) -> Matrix {
        assert_eq!(self.rows, other.rows);
        assert_eq!(self.cols, other.cols);
        let mut result = Matrix::new(self.rows, self.cols);
        const LANES: usize = 8;
        let len = self.data.len();
        let simd_len = len - (len % LANES);
        for i in (0..simd_len).step_by(LANES) {
            let a = f32x8::from_slice(&self.data[i..]);
            let b = f32x8::from_slice(&other.data[i..]);
            let r = a + b;
            r.copy_to_slice(&mut result.data[i..]);
        }
        // Scalar tail
        for i in simd_len..len {
            result.data[i] = self.data[i] + other.data[i];
        }
        result
    }

    /// Transpose (cache-friendly with SIMD)
    ///
    /// Works in BLOCK_SIZE x BLOCK_SIZE tiles so both source reads and
    /// destination writes stay within a small cache footprint.
    /// NOTE(review): despite the name, the inner copy is scalar — the
    /// blocking is the cache optimization here.
    fn transpose_simd(&self) -> Matrix {
        let mut result = Matrix::new(self.cols, self.rows);
        // Block size for cache efficiency
        const BLOCK_SIZE: usize = 16;
        for i_block in (0..self.rows).step_by(BLOCK_SIZE) {
            for j_block in (0..self.cols).step_by(BLOCK_SIZE) {
                // Clip partial tiles at the matrix edges
                let i_end = (i_block + BLOCK_SIZE).min(self.rows);
                let j_end = (j_block + BLOCK_SIZE).min(self.cols);
                for i in i_block..i_end {
                    for j in j_block..j_end {
                        result.set(j, i, self.get(i, j));
                    }
                }
            }
        }
        result
    }
}
#![feature(portable_simd)]
use std::simd::{f32x8, Simd, SimdFloat};
/// Sums `data` with 8-lane partial sums plus a scalar tail.
/// The lane-wise accumulation reorders float additions, so the result can
/// differ from a strictly sequential sum in the last bits.
fn sum_simd(data: &[f32]) -> f32 {
    const LANES: usize = 8;
    let len = data.len();
    let simd_len = len - (len % LANES);
    // One running partial sum per lane
    let mut sum = f32x8::splat(0.0);
    for i in (0..simd_len).step_by(LANES) {
        let vals = f32x8::from_slice(&data[i..]);
        sum += vals;
    }
    // Collapse the 8 partial sums
    let mut result = sum.reduce_sum();
    // Scalar remainder
    for i in simd_len..len {
        result += data[i];
    }
    result
}
/// Arithmetic mean computed via the SIMD sum.
/// An empty slice yields NaN (0.0 / 0.0), same as before.
fn mean_simd(data: &[f32]) -> f32 {
    let total = sum_simd(data);
    let count = data.len() as f32;
    total / count
}
/// Population variance (divides by n, not n-1) via a two-pass algorithm:
/// first the mean, then the mean of squared deviations.
/// An empty slice yields NaN.
fn variance_simd(data: &[f32]) -> f32 {
    let mean = mean_simd(data);
    let mean_vec = f32x8::splat(mean);
    const LANES: usize = 8;
    let len = data.len();
    let simd_len = len - (len % LANES);
    // Lane-wise sum of squared deviations
    let mut sum_sq = f32x8::splat(0.0);
    for i in (0..simd_len).step_by(LANES) {
        let vals = f32x8::from_slice(&data[i..]);
        let diff = vals - mean_vec;
        sum_sq += diff * diff;
    }
    let mut result = sum_sq.reduce_sum();
    // Scalar remainder
    for i in simd_len..len {
        let diff = data[i] - mean;
        result += diff * diff;
    }
    result / data.len() as f32
}
/// Population standard deviation: square root of the SIMD variance.
fn standard_deviation_simd(data: &[f32]) -> f32 {
    let var = variance_simd(data);
    var.sqrt()
}
/// Returns `(min, max)` of `data` using lane-wise min/max plus a scalar
/// tail. For an empty slice this returns `(INFINITY, NEG_INFINITY)` —
/// the identity elements — which callers should treat as "no data".
fn min_max_simd(data: &[f32]) -> (f32, f32) {
    const LANES: usize = 8;
    let len = data.len();
    let simd_len = len - (len % LANES);
    // Identity elements so the first comparison always wins
    let mut min_vec = f32x8::splat(f32::INFINITY);
    let mut max_vec = f32x8::splat(f32::NEG_INFINITY);
    for i in (0..simd_len).step_by(LANES) {
        let vals = f32x8::from_slice(&data[i..]);
        min_vec = min_vec.simd_min(vals);
        max_vec = max_vec.simd_max(vals);
    }
    // Collapse per-lane extrema
    let mut min = min_vec.reduce_min();
    let mut max = max_vec.reduce_max();
    // Scalar remainder
    for i in simd_len..len {
        min = min.min(data[i]);
        max = max.max(data[i]);
    }
    (min, max)
}
/// Rescales `data` in place to the range [0, 1] via (x - min) / range.
/// Leaves the slice untouched when all values are equal (range == 0),
/// avoiding a division by zero.
fn normalize_simd(data: &mut [f32]) {
    let (min, max) = min_max_simd(data);
    let range = max - min;
    if range == 0.0 {
        return;
    }
    const LANES: usize = 8;
    let len = data.len();
    let simd_len = len - (len % LANES);
    let min_vec = f32x8::splat(min);
    let range_vec = f32x8::splat(range);
    for i in (0..simd_len).step_by(LANES) {
        let vals = f32x8::from_slice(&data[i..]);
        let normalized = (vals - min_vec) / range_vec;
        normalized.copy_to_slice(&mut data[i..]);
    }
    // Scalar remainder
    for i in simd_len..len {
        data[i] = (data[i] - min) / range;
    }
}
SIMD exploits data parallelism where the same operation applies to multiple data elements:
Scalar: [a] * [b] = [a*b] (1 operation)
SIMD: [a,b,c,d] * [e,f,g,h] = [a*e, b*f, c*g, d*h] (4 operations in parallel)
SIMD instructions keep CPU pipelines full: a single decoded instruction feeds many arithmetic units at once.
SIMD reduces memory bottlenecks: wide loads and stores move an entire vector register's worth of data per instruction.
Modern CPUs can execute multiple SIMD instructions simultaneously: with several vector execution ports, independent vector operations can issue in the same cycle.
// ❌ DON'T: Unaligned data may be slower
let data = vec![0.0f32; 1000];
// ✅ DO: Ensure alignment for best performance
// Forces 32-byte alignment (one AVX register), enabling aligned vector loads
#[repr(align(32))]
struct AlignedData([f32; 1000]);
// ❌ DON'T: Miss remainder elements
// Intentionally broken example: from_slice needs at least 8 remaining
// elements, so this panics whenever data.len() is not a multiple of 8.
fn bad_sum_simd(data: &[f32]) -> f32 {
    let mut sum = f32x8::splat(0.0);
    for i in (0..data.len()).step_by(8) {
        sum += f32x8::from_slice(&data[i..]); // May panic!
    }
    sum.reduce_sum()
}
// ✅ DO: Handle remainder correctly
// Correct version: SIMD over the largest multiple of 8, then a scalar
// loop over the last len % 8 elements.
fn good_sum_simd(data: &[f32]) -> f32 {
    let simd_len = data.len() - (data.len() % 8);
    let mut sum = f32x8::splat(0.0);
    for i in (0..simd_len).step_by(8) {
        sum += f32x8::from_slice(&data[i..]);
    }
    // Horizontal reduction of the 8 partial sums
    let mut result = sum.reduce_sum();
    // Scalar tail
    for i in simd_len..data.len() {
        result += data[i];
    }
    result
}
// ❌ DON'T: Branching inside SIMD loop
// Per-element branch (the "DON'T" example): doubles positive entries only.
// The data-dependent branch is what blocks vectorization here.
fn bad_conditional(data: &mut [f32]) {
    for value in data.iter_mut() {
        if *value > 0.0 {
            *value *= 2.0;
        }
    }
}
// ✅ DO: Use SIMD select/blend
// Branch-free version: compute both outcomes for all lanes, then blend
// with a mask — no data-dependent branch inside the loop.
// NOTE(review): the trailing data.len() % LANES elements are never
// processed here; presumably omitted to keep the example short — confirm
// before reusing this as-is.
fn good_conditional(data: &mut [f32]) {
    const LANES: usize = 8;
    let simd_len = data.len() - (data.len() % LANES);
    for i in (0..simd_len).step_by(LANES) {
        let vals = f32x8::from_slice(&data[i..]);
        // Per-lane mask: true where the lane is positive
        let mask = vals.simd_gt(f32x8::splat(0.0));
        let doubled = vals * f32x8::splat(2.0);
        // select keeps `doubled` where mask is true, `vals` elsewhere
        let result = mask.select(doubled, vals);
        result.copy_to_slice(&mut data[i..]);
    }
}
// ❌ DON'T: Random access patterns
// Random-access gather (the "DON'T" example): each lookup may miss cache.
// Panics if any index is out of bounds, as before.
fn bad_gather(data: &[f32], indices: &[usize]) -> Vec<f32> {
    let mut gathered = Vec::with_capacity(indices.len());
    for &idx in indices {
        gathered.push(data[idx]);
    }
    gathered
}
// ✅ DO: Reorganize for sequential access when possible
// Sequential copy: contiguous access the hardware prefetcher loves.
fn good_sequential(data: &[f32]) -> Vec<f32> {
    data.to_vec()
}
// ❌ DON'T: SIMD for tiny datasets
// Deliberate overkill: splats both scalars across 8 lanes, adds, then
// horizontally sums — reduce_sum yields 8*(a+b) and the /8.0 undoes it.
// All that vector machinery to compute what `a + b` does directly.
fn overkill(a: f32, b: f32) -> f32 {
    let va = f32x8::splat(a);
    let vb = f32x8::splat(b);
    (va + vb).reduce_sum() / 8.0
}
// ✅ DO: Use scalar for small operations
// For a single pair of values, plain scalar addition is optimal.
fn sensible(a: f32, b: f32) -> f32 {
    a + b
}
// Image processing uses SIMD internally
use image::*;
use rayon::prelude::*;
data.par_iter_mut()
.for_each(|x| *x *= 2.0); // Often vectorized
#![feature(portable_simd)]
use std::simd::*;
use simdeez::*;
simd_runtime_generate!(
fn add_arrays(a: &[f32], b: &[f32]) -> Vec<f32> {
// Automatically dispatches to best SIMD
}
);
/// Chapter driver. Only the auto-vectorization demo actually runs; the
/// remaining sections need nightly or arch-specific features, so just
/// their headings are printed.
fn main() {
    println!("=== Auto-Vectorization ===");
    example_auto_vectorization();

    // Placeholder headings for the sections demonstrated above
    for heading in [
        "\n=== Explicit SIMD (requires nightly) ===",
        "\n=== Architecture-Specific ===",
        "\n=== Image Processing ===",
        "\n=== String Operations ===",
        "\n=== Matrix Operations ===",
        "\n=== Statistical Operations ===",
    ] {
        println!("{}", heading);
    }
}
Run this code in the official Rust Playground