Struct packing and alignment
Memory layout optimization is the practice of arranging data structures to minimize memory usage, reduce cache misses, and improve CPU efficiency. This involves understanding struct packing, padding elimination, cache alignment, and data access patterns to create high-performance, memory-efficient code.
Key optimization techniques:
Poor memory layout causes:
Without optimization, you get:
use std::mem::{size_of, align_of};
// ❌ Poor ordering - lots of padding
#[derive(Debug)]
struct Unoptimized {
a: u8, // 1 byte
// 7 bytes padding
b: u64, // 8 bytes
c: u16, // 2 bytes
// 6 bytes padding
d: u64, // 8 bytes
e: u8, // 1 byte
// 7 bytes padding
}
// Total: 40 bytes
// ✅ Optimized ordering - minimal padding
#[derive(Debug)]
struct Optimized {
b: u64, // 8 bytes
d: u64, // 8 bytes
c: u16, // 2 bytes
a: u8, // 1 byte
e: u8, // 1 byte
// 4 bytes padding (to align to 8)
}
// Total: 24 bytes (40% reduction!)
fn example_field_ordering() {
println!("Unoptimized: size={}, align={}",
size_of::<Unoptimized>(), align_of::<Unoptimized>());
println!("Optimized: size={}, align={}",
size_of::<Optimized>(), align_of::<Optimized>());
// Demonstrate memory savings
let unopt_array = vec![Unoptimized {
a: 1, b: 2, c: 3, d: 4, e: 5
}; 1000];
let opt_array = vec![Optimized {
a: 1, b: 2, c: 3, d: 4, e: 5
}; 1000];
println!("1000 unoptimized: {} bytes",
size_of::<Unoptimized>() * 1000);
println!("1000 optimized: {} bytes",
size_of::<Optimized>() * 1000);
println!("Savings: {} bytes",
(size_of::<Unoptimized>() - size_of::<Optimized>()) * 1000);
}
use std::mem::size_of;
// ❌ Large enum due to biggest variant
#[derive(Debug)]
enum UnoptimizedEnum {
Small(u8),
Medium(u32),
Large([u8; 1024]), // This dominates the size
}
// ✅ Box large variants
#[derive(Debug)]
enum OptimizedEnum {
Small(u8),
Medium(u32),
Large(Box<[u8; 1024]>), // Heap allocation
}
// ✅ Even better: separate type for large variant
#[derive(Debug)]
enum SmallEnum {
Small(u8),
Medium(u32),
}
#[derive(Debug)]
struct LargeData([u8; 1024]);
fn example_enum_optimization() {
println!("UnoptimizedEnum size: {}", size_of::<UnoptimizedEnum>());
println!("OptimizedEnum size: {}", size_of::<OptimizedEnum>());
println!("SmallEnum size: {}", size_of::<SmallEnum>());
// Memory usage for array of enums
let unopt = vec![UnoptimizedEnum::Small(1); 1000];
let opt = vec![OptimizedEnum::Small(1); 1000];
println!("\n1000 UnoptimizedEnum: {} KB",
size_of::<UnoptimizedEnum>() * 1000 / 1024);
println!("1000 OptimizedEnum: {} KB",
size_of::<OptimizedEnum>() * 1000 / 1024);
}
use std::mem::size_of;
// ❌ Inefficient bool storage
#[derive(Debug)]
struct UnpackedFlags {
flag1: bool, // 1 byte
flag2: bool, // 1 byte
flag3: bool, // 1 byte
flag4: bool, // 1 byte
flag5: bool, // 1 byte
flag6: bool, // 1 byte
flag7: bool, // 1 byte
flag8: bool, // 1 byte
}
// Total: 8 bytes
// ✅ Bit-packed flags
#[derive(Debug, Clone, Copy)]
struct PackedFlags(u8);
impl PackedFlags {
const FLAG1: u8 = 1 << 0;
const FLAG2: u8 = 1 << 1;
const FLAG3: u8 = 1 << 2;
const FLAG4: u8 = 1 << 3;
const FLAG5: u8 = 1 << 4;
const FLAG6: u8 = 1 << 5;
const FLAG7: u8 = 1 << 6;
const FLAG8: u8 = 1 << 7;
fn new() -> Self {
PackedFlags(0)
}
fn set(&mut self, flag: u8) {
self.0 |= flag;
}
fn clear(&mut self, flag: u8) {
self.0 &= !flag;
}
fn is_set(&self, flag: u8) -> bool {
self.0 & flag != 0
}
fn toggle(&mut self, flag: u8) {
self.0 ^= flag;
}
}
// Total: 1 byte (87.5% reduction!)
fn example_bit_packing() {
println!("UnpackedFlags size: {}", size_of::<UnpackedFlags>());
println!("PackedFlags size: {}", size_of::<PackedFlags>());
let mut flags = PackedFlags::new();
flags.set(PackedFlags::FLAG1);
flags.set(PackedFlags::FLAG3);
println!("Flag1 set: {}", flags.is_set(PackedFlags::FLAG1));
println!("Flag2 set: {}", flags.is_set(PackedFlags::FLAG2));
println!("Flag3 set: {}", flags.is_set(PackedFlags::FLAG3));
}
use std::sync::atomic::{AtomicU64, Ordering};
use std::thread;
const CACHE_LINE_SIZE: usize = 64;
// ❌ False sharing - counters share cache line
struct FalseSharing {
counter1: AtomicU64,
counter2: AtomicU64,
}
// ✅ Prevent false sharing with padding
#[repr(align(64))]
struct CacheAligned {
counter: AtomicU64,
}
struct NoFalseSharing {
counter1: CacheAligned,
counter2: CacheAligned,
}
fn benchmark_false_sharing() {
let shared = std::sync::Arc::new(FalseSharing {
counter1: AtomicU64::new(0),
counter2: AtomicU64::new(0),
});
let start = std::time::Instant::now();
let handles: Vec<_> = (0..2)
.map(|i| {
let shared = shared.clone();
thread::spawn(move || {
let counter = if i == 0 { &shared.counter1 } else { &shared.counter2 };
for _ in 0..10_000_000 {
counter.fetch_add(1, Ordering::Relaxed);
}
})
})
.collect();
for handle in handles {
handle.join().unwrap();
}
let duration = start.elapsed();
println!("False sharing time: {:?}", duration);
}
fn benchmark_no_false_sharing() {
let shared = std::sync::Arc::new(NoFalseSharing {
counter1: CacheAligned { counter: AtomicU64::new(0) },
counter2: CacheAligned { counter: AtomicU64::new(0) },
});
let start = std::time::Instant::now();
let handles: Vec<_> = (0..2)
.map(|i| {
let shared = shared.clone();
thread::spawn(move || {
let counter = if i == 0 { &shared.counter1.counter } else { &shared.counter2.counter };
for _ in 0..10_000_000 {
counter.fetch_add(1, Ordering::Relaxed);
}
})
})
.collect();
for handle in handles {
handle.join().unwrap();
}
let duration = start.elapsed();
println!("No false sharing time: {:?}", duration);
}
use std::time::Instant;
// AoS: Array of Structures (traditional OOP style)
#[derive(Clone, Copy)]
struct Particle {
x: f32,
y: f32,
z: f32,
vx: f32,
vy: f32,
vz: f32,
mass: f32,
}
struct ParticlesAoS {
particles: Vec<Particle>,
}
impl ParticlesAoS {
fn new(count: usize) -> Self {
ParticlesAoS {
particles: vec![Particle {
x: 0.0, y: 0.0, z: 0.0,
vx: 1.0, vy: 1.0, vz: 1.0,
mass: 1.0,
}; count],
}
}
fn update_positions(&mut self, dt: f32) {
for particle in &mut self.particles {
particle.x += particle.vx * dt;
particle.y += particle.vy * dt;
particle.z += particle.vz * dt;
}
}
}
// SoA: Structure of Arrays (cache-friendly)
struct ParticlesSoA {
x: Vec<f32>,
y: Vec<f32>,
z: Vec<f32>,
vx: Vec<f32>,
vy: Vec<f32>,
vz: Vec<f32>,
mass: Vec<f32>,
}
impl ParticlesSoA {
fn new(count: usize) -> Self {
ParticlesSoA {
x: vec![0.0; count],
y: vec![0.0; count],
z: vec![0.0; count],
vx: vec![1.0; count],
vy: vec![1.0; count],
vz: vec![1.0; count],
mass: vec![1.0; count],
}
}
fn update_positions(&mut self, dt: f32) {
// Better cache locality - sequential access
for i in 0..self.x.len() {
self.x[i] += self.vx[i] * dt;
self.y[i] += self.vy[i] * dt;
self.z[i] += self.vz[i] * dt;
}
}
// Even better: iterator-based with potential auto-vectorization
fn update_positions_optimized(&mut self, dt: f32) {
self.x.iter_mut()
.zip(self.vx.iter())
.for_each(|(x, vx)| *x += vx * dt);
self.y.iter_mut()
.zip(self.vy.iter())
.for_each(|(y, vy)| *y += vy * dt);
self.z.iter_mut()
.zip(self.vz.iter())
.for_each(|(z, vz)| *z += vz * dt);
}
}
fn example_aos_vs_soa() {
const COUNT: usize = 1_000_000;
const ITERATIONS: usize = 100;
let dt = 0.016;
// Benchmark AoS
let mut aos = ParticlesAoS::new(COUNT);
let start = Instant::now();
for _ in 0..ITERATIONS {
aos.update_positions(dt);
}
let aos_time = start.elapsed();
println!("AoS time: {:?}", aos_time);
// Benchmark SoA
let mut soa = ParticlesSoA::new(COUNT);
let start = Instant::now();
for _ in 0..ITERATIONS {
soa.update_positions(dt);
}
let soa_time = start.elapsed();
println!("SoA time: {:?}", soa_time);
// Benchmark SoA optimized
let mut soa_opt = ParticlesSoA::new(COUNT);
let start = Instant::now();
for _ in 0..ITERATIONS {
soa_opt.update_positions_optimized(dt);
}
let soa_opt_time = start.elapsed();
println!("SoA optimized time: {:?}", soa_opt_time);
println!("SoA speedup: {:.2}x", aos_time.as_secs_f64() / soa_time.as_secs_f64());
}
use std::collections::HashMap;
use std::sync::Arc;
// ❌ Duplicate strings waste memory
struct UnoptimizedData {
records: Vec<(String, String, i32)>,
}
impl UnoptimizedData {
fn new() -> Self {
let mut records = Vec::new();
// Many records with duplicate strings
for i in 0..10000 {
records.push((
format!("user_{}", i % 100), // 100 unique users
format!("category_{}", i % 10), // 10 unique categories
i,
));
}
UnoptimizedData { records }
}
fn memory_usage(&self) -> usize {
self.records.iter()
.map(|(s1, s2, _)| s1.len() + s2.len() + 24) // String overhead
.sum()
}
}
// ✅ String interning saves memory
struct StringInterner {
strings: HashMap<String, Arc<str>>,
}
impl StringInterner {
fn new() -> Self {
StringInterner {
strings: HashMap::new(),
}
}
fn intern(&mut self, s: &str) -> Arc<str> {
if let Some(interned) = self.strings.get(s) {
Arc::clone(interned)
} else {
let arc: Arc<str> = Arc::from(s);
self.strings.insert(s.to_string(), Arc::clone(&arc));
arc
}
}
}
struct OptimizedData {
records: Vec<(Arc<str>, Arc<str>, i32)>,
}
impl OptimizedData {
fn new() -> Self {
let mut interner = StringInterner::new();
let mut records = Vec::new();
for i in 0..10000 {
records.push((
interner.intern(&format!("user_{}", i % 100)),
interner.intern(&format!("category_{}", i % 10)),
i,
));
}
OptimizedData { records }
}
fn memory_usage(&self) -> usize {
// Each Arc is just a pointer (8 bytes)
self.records.len() * (8 + 8 + 4)
}
}
fn example_string_interning() {
let unopt = UnoptimizedData::new();
let opt = OptimizedData::new();
println!("Unoptimized memory: ~{} bytes", unopt.memory_usage());
println!("Optimized memory: ~{} bytes", opt.memory_usage());
println!("Savings: ~{} bytes",
unopt.memory_usage() as i64 - opt.memory_usage() as i64);
}
use std::mem::size_of;
// ❌ Always heap-allocates
struct AlwaysHeap {
s: String,
}
// ✅ Small String Optimization
enum SmallString {
Inline([u8; 23], u8), // 23 bytes + 1 length byte
Heap(String),
}
impl SmallString {
fn new(s: &str) -> Self {
if s.len() <= 23 {
let mut buf = [0u8; 23];
buf[..s.len()].copy_from_slice(s.as_bytes());
SmallString::Inline(buf, s.len() as u8)
} else {
SmallString::Heap(s.to_string())
}
}
fn as_str(&self) -> &str {
match self {
SmallString::Inline(buf, len) => {
std::str::from_utf8(&buf[..*len as usize]).unwrap()
}
SmallString::Heap(s) => s.as_str(),
}
}
}
// Better: use smallvec or similar crate
use smallvec::SmallVec;
fn example_small_string() {
println!("String size: {}", size_of::<String>());
println!("SmallString size: {}", size_of::<SmallString>());
let short = SmallString::new("short");
let long = SmallString::new("this is a much longer string");
println!("Short: {} (inline: {})",
short.as_str(),
matches!(short, SmallString::Inline(_, _)));
println!("Long: {} (inline: {})",
long.as_str(),
matches!(long, SmallString::Inline(_, _)));
// SmallVec example
let mut v: SmallVec<[i32; 4]> = SmallVec::new();
v.push(1);
v.push(2);
v.push(3);
println!("SmallVec (inline): {:?}, spilled: {}", v, v.spilled());
v.push(4);
v.push(5);
println!("SmallVec (heap): {:?}, spilled: {}", v, v.spilled());
}
Modern CPUs have a cache hierarchy:
Optimizing for cache locality can make code 10-100x faster!
Accessing nearby memory is faster due to cache lines (typically 64 bytes):
// Good: Sequential access (one cache line)
for i in 0..array.len() {
sum += array[i];
}
// Bad: Random access (many cache lines)
for i in random_indices {
sum += array[i];
}
Recently accessed data is likely in cache:
// Good: Use data while it's hot
let x = expensive_computation();
use_value(x);
use_value_again(x);
// Bad: Recompute or reload from memory
let x = expensive_computation();
// ... lots of other code ...
let x = expensive_computation(); // Cache likely evicted
When multiple threads write to different variables on the same cache line, the CPU must synchronize, causing slowdowns.
Aligned data can be accessed in a single CPU instruction. Misaligned data may require multiple instructions or cause hardware exceptions.
// ❌ DON'T: Optimize before profiling
struct OverEngineered {
// Complicated bit-packing for rarely-used data
flags: u64,
}
// ✅ DO: Start simple, optimize hot paths
struct Simple {
flag1: bool,
flag2: bool,
}
use std::sync::atomic::AtomicU64;
// ❌ DON'T: Put hot atomics next to each other
struct BadCounters {
thread1_counter: AtomicU64,
thread2_counter: AtomicU64, // False sharing!
}
// ✅ DO: Pad to cache line
#[repr(align(64))]
struct GoodCounter {
counter: AtomicU64,
}
// ❌ DON'T: Multiple passes over data
let sum: i32 = vec.iter().map(|x| x * 2).sum();
let count: usize = vec.iter().filter(|x| **x > 10).count();
// ✅ DO: Single pass when possible
let (sum, count) = vec.iter()
.fold((0, 0), |(sum, count), &x| {
let doubled = x * 2;
(sum + doubled, count + if x > 10 { 1 } else { 0 })
});
// ❌ DON'T: Box small types unnecessarily
enum Bad {
Small(Box<u8>), // Heap allocation for 1 byte!
}
// ✅ DO: Box only large types
enum Good {
Small(u8),
Large(Box<[u8; 1024]>),
}
// ❌ DON'T: Unaligned data for SIMD
let data = vec![0.0f32; 1000];
// ✅ DO: Align for SIMD operations
#[repr(align(32))]
struct AlignedData([f32; 1000]);
use std::alloc::{alloc, dealloc, Layout};
use std::ptr::NonNull;
/// Fixed-size memory pool for fast allocation
struct MemoryPool<T> {
blocks: Vec<NonNull<T>>,
free_list: Vec<NonNull<T>>,
capacity: usize,
}
impl<T> MemoryPool<T> {
fn new(capacity: usize) -> Self {
let mut blocks = Vec::with_capacity(capacity);
let mut free_list = Vec::with_capacity(capacity);
unsafe {
let layout = Layout::array::<T>(capacity).unwrap();
let ptr = alloc(layout) as *mut T;
for i in 0..capacity {
let block = NonNull::new_unchecked(ptr.add(i));
blocks.push(block);
free_list.push(block);
}
}
MemoryPool {
blocks,
free_list,
capacity,
}
}
fn allocate(&mut self) -> Option<NonNull<T>> {
self.free_list.pop()
}
fn deallocate(&mut self, ptr: NonNull<T>) {
// In production, verify ptr is from this pool
self.free_list.push(ptr);
}
fn available(&self) -> usize {
self.free_list.len()
}
}
impl<T> Drop for MemoryPool<T> {
fn drop(&mut self) {
unsafe {
if !self.blocks.is_empty() {
let layout = Layout::array::<T>(self.capacity).unwrap();
dealloc(self.blocks[0].as_ptr() as *mut u8, layout);
}
}
}
}
fn memory_pool_example() {
let mut pool = MemoryPool::<i32>::new(1000);
let mut allocated = Vec::new();
for _ in 0..100 {
if let Some(ptr) = pool.allocate() {
allocated.push(ptr);
}
}
println!("Allocated: 100, Available: {}", pool.available());
for ptr in allocated {
pool.deallocate(ptr);
}
println!("After deallocation, Available: {}", pool.available());
}
/// Matrix stored in row-major order
struct Matrix {
data: Vec<f64>,
rows: usize,
cols: usize,
}
impl Matrix {
fn new(rows: usize, cols: usize) -> Self {
Matrix {
data: vec![0.0; rows * cols],
rows,
cols,
}
}
fn get(&self, row: usize, col: usize) -> f64 {
self.data[row * self.cols + col]
}
fn set(&mut self, row: usize, col: usize, value: f64) {
self.data[row * self.cols + col] = value;
}
// Naive matrix multiplication - poor cache behavior
fn multiply_naive(&self, other: &Matrix) -> Matrix {
assert_eq!(self.cols, other.rows);
let mut result = Matrix::new(self.rows, other.cols);
for i in 0..self.rows {
for j in 0..other.cols {
let mut sum = 0.0;
for k in 0..self.cols {
sum += self.get(i, k) * other.get(k, j);
}
result.set(i, j, sum);
}
}
result
}
// Cache-friendly blocked multiplication
fn multiply_blocked(&self, other: &Matrix, block_size: usize) -> Matrix {
assert_eq!(self.cols, other.rows);
let mut result = Matrix::new(self.rows, other.cols);
for i0 in (0..self.rows).step_by(block_size) {
for j0 in (0..other.cols).step_by(block_size) {
for k0 in (0..self.cols).step_by(block_size) {
// Process block
for i in i0..i0 + block_size.min(self.rows - i0) {
for j in j0..j0 + block_size.min(other.cols - j0) {
let mut sum = result.get(i, j);
for k in k0..k0 + block_size.min(self.cols - k0) {
sum += self.get(i, k) * other.get(k, j);
}
result.set(i, j, sum);
}
}
}
}
}
result
}
}
fn cache_oblivious_example() {
let size = 256;
let mut a = Matrix::new(size, size);
let mut b = Matrix::new(size, size);
// Initialize matrices
for i in 0..size {
for j in 0..size {
a.set(i, j, (i + j) as f64);
b.set(i, j, (i * j) as f64);
}
}
// Benchmark naive
let start = std::time::Instant::now();
let _ = a.multiply_naive(&b);
println!("Naive multiplication: {:?}", start.elapsed());
// Benchmark blocked
let start = std::time::Instant::now();
let _ = a.multiply_blocked(&b, 32);
println!("Blocked multiplication: {:?}", start.elapsed());
}
use std::mem::{self, MaybeUninit};
use std::ptr;
/// Vec with inline storage for small sizes
enum SmallVec<T, const N: usize> {
Inline {
data: [MaybeUninit<T>; N],
len: usize,
},
Heap {
ptr: *mut T,
len: usize,
capacity: usize,
},
}
impl<T, const N: usize> SmallVec<T, N> {
fn new() -> Self {
SmallVec::Inline {
data: unsafe { MaybeUninit::uninit().assume_init() },
len: 0,
}
}
fn push(&mut self, value: T) {
match self {
SmallVec::Inline { data, len } if *len < N => {
data[*len] = MaybeUninit::new(value);
*len += 1;
}
_ => {
// Need to spill to heap
self.spill_to_heap();
self.push(value);
}
}
}
fn spill_to_heap(&mut self) {
let old_self = mem::replace(self, SmallVec::Inline {
data: unsafe { MaybeUninit::uninit().assume_init() },
len: 0,
});
if let SmallVec::Inline { data, len } = old_self {
let capacity = (len * 2).max(N * 2);
let layout = std::alloc::Layout::array::<T>(capacity).unwrap();
let ptr = unsafe { std::alloc::alloc(layout) as *mut T };
unsafe {
for i in 0..len {
ptr.add(i).write(data[i].assume_init_read());
}
}
*self = SmallVec::Heap { ptr, len, capacity };
}
}
fn len(&self) -> usize {
match self {
SmallVec::Inline { len, .. } => *len,
SmallVec::Heap { len, .. } => *len,
}
}
fn as_slice(&self) -> &[T] {
match self {
SmallVec::Inline { data, len } => unsafe {
std::slice::from_raw_parts(data.as_ptr() as *const T, *len)
},
SmallVec::Heap { ptr, len, .. } => unsafe {
std::slice::from_raw_parts(*ptr, *len)
},
}
}
}
impl<T, const N: usize> Drop for SmallVec<T, N> {
fn drop(&mut self) {
match self {
SmallVec::Inline { data, len } => {
for i in 0..*len {
unsafe {
data[i].assume_init_drop();
}
}
}
SmallVec::Heap { ptr, len, capacity } => {
unsafe {
for i in 0..*len {
ptr.add(i).drop_in_place();
}
let layout = std::alloc::Layout::array::<T>(*capacity).unwrap();
std::alloc::dealloc(*ptr as *mut u8, layout);
}
}
}
}
}
// SoA for better cache locality
struct ComponentStorage {
positions: Vec<Position>,
velocities: Vec<Velocity>,
healths: Vec<Health>,
}
// Apache Arrow uses columnar layout
struct ColumnBatch {
ids: Vec<i64>,
names: Vec<String>,
ages: Vec<i32>,
}
#[repr(align(128))]
struct CachePadded<T> {
value: T,
}
use smallvec::{SmallVec, smallvec};
let mut v: SmallVec<[i32; 4]> = smallvec![1, 2, 3];
fn main() {
println!("=== Field Ordering ===");
example_field_ordering();
println!("\n=== Enum Optimization ===");
example_enum_optimization();
println!("\n=== Bit Packing ===");
example_bit_packing();
println!("\n=== False Sharing ===");
benchmark_false_sharing();
benchmark_no_false_sharing();
println!("\n=== AoS vs SoA ===");
example_aos_vs_soa();
println!("\n=== String Interning ===");
example_string_interning();
println!("\n=== Small String ===");
example_small_string();
println!("\n=== Memory Pool ===");
memory_pool_example();
println!("\n=== Cache-Oblivious ===");
cache_oblivious_example();
}
Run this code in the official Rust Playground