Proper performance measurement
Criterion is a statistics-driven benchmarking library for Rust that provides accurate, reliable performance measurements. Unlike simple timing measurements, Criterion uses statistical techniques to detect performance changes, identify outliers, and generate detailed reports with graphs and analysis.
Key features: statistical analysis over many samples, automatic warm-up, outlier detection, regression comparison against saved baselines, and optional HTML reports with plots.
Naive benchmarking has many pitfalls:
// ❌ Unreliable benchmarking
use std::time::Instant;
fn bad_benchmark() {
// ❌ One wall-clock sample: no warm-up, no repeated iterations, no outlier
// handling — and the compiler may even optimize the call away entirely,
// since the result is unused.
let start = Instant::now();
expensive_function();
let duration = start.elapsed();
println!("Took {:?}", duration); // Single measurement, no statistics
}
Problems with simple timing:
use criterion::{black_box, criterion_group, criterion_main, Criterion};
fn fibonacci_recursive(n: u32) -> u32 {
    // Intentionally naive O(2^n) recursion — the slow baseline for the benchmark.
    if n < 2 {
        n
    } else {
        fibonacci_recursive(n - 1) + fibonacci_recursive(n - 2)
    }
}
fn fibonacci_iterative(n: u32) -> u32 {
    // Linear-time fibonacci: carry the (current, next) pair through a fold.
    (0..n).fold((0u32, 1u32), |(curr, next), _| (next, curr + next)).0
}
fn criterion_benchmark(c: &mut Criterion) {
// Simple benchmark
c.bench_function("fib_recursive_20", |b| {
b.iter(|| fibonacci_recursive(black_box(20)))
});
c.bench_function("fib_iterative_20", |b| {
b.iter(|| fibonacci_iterative(black_box(20)))
});
}
criterion_group!(benches, criterion_benchmark);
criterion_main!(benches);
use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion};
/// Stable sort of the input in place.
///
/// Takes `&mut [i32]` instead of `&mut Vec<i32>` (clippy::ptr_arg): sorting
/// never resizes the container, and existing `&mut Vec<i32>` call sites still
/// work via deref coercion.
fn sort_vec(data: &mut [i32]) {
    data.sort();
}
/// Unstable (typically faster, non-allocating) sort of the input in place.
///
/// Takes `&mut [i32]` instead of `&mut Vec<i32>` (clippy::ptr_arg); existing
/// `&mut Vec<i32>` call sites still work via deref coercion.
fn sort_unstable_vec(data: &mut [i32]) {
    data.sort_unstable();
}
/// Compares stable vs unstable sort across several input sizes.
fn sorting_benchmark(c: &mut Criterion) {
    let mut group = c.benchmark_group("sorting");
    for size in [100, 1000, 10000, 100000].iter() {
        // iter_batched rebuilds the reverse-sorted input for every batch, so
        // only the sort itself is timed — never the setup allocation.
        group.bench_with_input(BenchmarkId::new("stable", size), size, |b, &size| {
            b.iter_batched(
                || (0..size).rev().collect::<Vec<_>>(),
                |mut input| sort_vec(black_box(&mut input)),
                criterion::BatchSize::SmallInput,
            )
        });
        group.bench_with_input(BenchmarkId::new("unstable", size), size, |b, &size| {
            b.iter_batched(
                || (0..size).rev().collect::<Vec<_>>(),
                |mut input| sort_unstable_vec(black_box(&mut input)),
                criterion::BatchSize::SmallInput,
            )
        });
    }
    group.finish();
}
criterion_group!(benches, sorting_benchmark);
criterion_main!(benches);
use criterion::{black_box, criterion_group, criterion_main, Criterion, Throughput};
/// Sums all bytes, widened to u64.
fn process_bytes(data: &[u8]) -> u64 {
    data.iter().fold(0u64, |acc, &byte| acc + u64::from(byte))
}
/// Simple additive checksum; wrapping_add makes overflow well-defined.
fn checksum(data: &[u8]) -> u32 {
    let mut acc = 0u32;
    for &byte in data {
        acc = acc.wrapping_add(u32::from(byte));
    }
    acc
}
/// Simulated "compression": keeps every other byte (the first of each
/// 2-byte chunk), halving the input size (rounded up).
fn compress_simulate(data: &[u8]) -> Vec<u8> {
    data.iter().step_by(2).copied().collect()
}
fn throughput_benchmark(c: &mut Criterion) {
let sizes = vec![1024, 10 * 1024, 100 * 1024, 1024 * 1024];
for size in sizes {
let data = vec![0xFFu8; size];
let mut group = c.benchmark_group("throughput");
group.throughput(Throughput::Bytes(size as u64));
group.bench_function(BenchmarkId::new("process_bytes", size), |b| {
b.iter(|| process_bytes(black_box(&data)))
});
group.bench_function(BenchmarkId::new("checksum", size), |b| {
b.iter(|| checksum(black_box(&data)))
});
group.bench_function(BenchmarkId::new("compress", size), |b| {
b.iter(|| compress_simulate(black_box(&data)))
});
group.finish();
}
}
criterion_group!(benches, throughput_benchmark);
criterion_main!(benches);
use criterion::{black_box, criterion_group, criterion_main, Criterion};
use std::collections::{HashMap, BTreeMap};
/// Inserts `count` entries into a fresh `HashMap` (benchmark workload).
fn hashmap_insert(count: usize) {
    let mut map = HashMap::new();
    for i in 0..count {
        map.insert(i, i * 2);
    }
    // The map is never read, and the benchmark only black_boxes `count` —
    // without this the optimizer may delete the work being measured.
    std::hint::black_box(map);
}
/// Inserts `count` entries into a fresh `BTreeMap` (benchmark workload).
fn btreemap_insert(count: usize) {
    let mut map = BTreeMap::new();
    for i in 0..count {
        map.insert(i, i * 2);
    }
    // Keep the unused map observable so the measured work is not optimized out.
    std::hint::black_box(map);
}
/// Sums the values found for `keys`; keys missing from the map are skipped.
fn hashmap_lookup(map: &HashMap<usize, usize>, keys: &[usize]) -> usize {
    let mut total = 0;
    for key in keys {
        if let Some(value) = map.get(key) {
            total += value;
        }
    }
    total
}
/// Sums the values found for `keys`; keys missing from the map are skipped.
fn btreemap_lookup(map: &BTreeMap<usize, usize>, keys: &[usize]) -> usize {
    let mut total = 0;
    for key in keys {
        if let Some(value) = map.get(key) {
            total += value;
        }
    }
    total
}
fn map_benchmark(c: &mut Criterion) {
let mut group = c.benchmark_group("map_insert");
for size in [100, 1000, 10000].iter() {
group.bench_with_input(BenchmarkId::new("HashMap", size), size, |b, &size| {
b.iter(|| hashmap_insert(black_box(size)))
});
group.bench_with_input(BenchmarkId::new("BTreeMap", size), size, |b, &size| {
b.iter(|| btreemap_insert(black_box(size)))
});
}
group.finish();
// Lookup benchmarks
let mut group = c.benchmark_group("map_lookup");
let size = 10000;
let hashmap: HashMap<_, _> = (0..size).map(|i| (i, i * 2)).collect();
let btreemap: BTreeMap<_, _> = (0..size).map(|i| (i, i * 2)).collect();
let keys: Vec<_> = (0..size).step_by(10).collect();
group.bench_function("HashMap", |b| {
b.iter(|| hashmap_lookup(black_box(&hashmap), black_box(&keys)))
});
group.bench_function("BTreeMap", |b| {
b.iter(|| btreemap_lookup(black_box(&btreemap), black_box(&keys)))
});
group.finish();
}
criterion_group!(benches, map_benchmark);
criterion_main!(benches);
use criterion::{
criterion_group, criterion_main, measurement::WallTime, BenchmarkGroup, Criterion,
};
use std::time::Duration;
fn allocate_vec(size: usize) -> Vec<u8> {
// `vec![0u8; n]` may be served by a zeroed allocation (calloc-style) under
// the hood; that exact allocation path is what this benchmark measures, so
// the form of this expression is intentional.
vec![0u8; size]
}
fn allocate_boxed_slice(size: usize) -> Box<[u8]> {
// Same zero-filled allocation as `allocate_vec`, then converted to
// `Box<[u8]>`, which drops the separate capacity field.
vec![0u8; size].into_boxed_slice()
}
/// Compares `Vec<u8>` vs `Box<[u8]>` allocation at several sizes.
fn memory_benchmark(c: &mut Criterion) {
    let mut group = c.benchmark_group("memory_allocation");
    // Longer window and more samples than the defaults, since individual
    // allocations are very fast.
    group.measurement_time(Duration::from_secs(10));
    group.sample_size(1000);
    for &size in &[1024, 10 * 1024, 100 * 1024] {
        group.bench_with_input(
            criterion::BenchmarkId::new("Vec", size),
            &size,
            |b, &size| {
                b.iter(|| {
                    // The buffer is dropped inside the timed closure.
                    let buf = allocate_vec(size);
                    criterion::black_box(buf);
                })
            },
        );
        group.bench_with_input(
            criterion::BenchmarkId::new("BoxedSlice", size),
            &size,
            |b, &size| {
                b.iter(|| {
                    let buf = allocate_boxed_slice(size);
                    criterion::black_box(buf);
                })
            },
        );
    }
    group.finish();
}
criterion_group!(benches, memory_benchmark);
criterion_main!(benches);
use criterion::{black_box, criterion_group, criterion_main, BatchSize, Criterion};
use std::fs::File;
use std::io::{BufWriter, Write};
/// Writes `data` to a freshly created file without buffering.
///
/// `std::fs::write` is the stdlib one-shot equivalent of
/// `File::create` + `write_all`, so the measured behavior is unchanged.
fn write_data_unbuffered(path: &str, data: &[u8]) -> std::io::Result<()> {
    std::fs::write(path, data)
}
/// Writes `data` to a freshly created file through a `BufWriter`.
///
/// Fix: flush explicitly before returning — `BufWriter`'s `Drop` impl does
/// flush, but it silently discards any I/O error, so a failed write could
/// previously return `Ok(())`.
fn write_data_buffered(path: &str, data: &[u8]) -> std::io::Result<()> {
    let file = File::create(path)?;
    let mut writer = BufWriter::new(file);
    writer.write_all(data)?;
    writer.flush()?;
    Ok(())
}
/// Buffered vs unbuffered 1 MB file writes.
///
/// Fixes over the previous version:
/// - Temp paths come from `std::env::temp_dir()` instead of a hard-coded
///   `/tmp`, which does not exist on Windows.
/// - Unique filenames come from a process-id + atomic counter, removing the
///   third-party `rand` dependency.
fn io_benchmark(c: &mut Criterion) {
    // Generates a unique, portable temp-file path per call.
    fn temp_path(prefix: &str) -> String {
        use std::sync::atomic::{AtomicU64, Ordering};
        static COUNTER: AtomicU64 = AtomicU64::new(0);
        let n = COUNTER.fetch_add(1, Ordering::Relaxed);
        std::env::temp_dir()
            .join(format!("{}_{}_{}", prefix, std::process::id(), n))
            .to_string_lossy()
            .into_owned()
    }
    let data = vec![0u8; 1024 * 1024]; // 1 MB payload written each iteration
    let mut group = c.benchmark_group("file_io");
    group.bench_function("unbuffered", |b| {
        b.iter_batched(
            // Setup (untimed): fresh path + a clone of the payload.
            || (temp_path("bench_unbuf"), data.clone()),
            // Timed routine: the write itself.
            |(path, data)| {
                write_data_unbuffered(&path, &data).unwrap();
            },
            BatchSize::SmallInput,
        );
        // Teardown: files are not cleaned up in this simple example
    });
    group.bench_function("buffered", |b| {
        b.iter_batched(
            || (temp_path("bench_buf"), data.clone()),
            |(path, data)| {
                write_data_buffered(&path, &data).unwrap();
            },
            BatchSize::SmallInput,
        );
    });
    group.finish();
}
criterion_group!(benches, io_benchmark);
criterion_main!(benches);
use criterion::{black_box, criterion_group, criterion_main, Criterion};
/// Baseline: sums 0..n with an explicit accumulation loop.
fn old_implementation(n: usize) -> usize {
    let mut total = 0;
    for i in 0..n {
        total += i;
    }
    total
}
/// Optimized version: closed-form sum of `0..n` (n terms: 0 + 1 + … + n-1).
///
/// Fix: guard `n == 0` — `n - 1` on a `usize` underflows, panicking in debug
/// builds, whereas `old_implementation(0)` correctly returns 0.
fn new_implementation(n: usize) -> usize {
    if n == 0 {
        0
    } else {
        n * (n - 1) / 2
    }
}
/// Old vs new implementation, intended for baseline comparison.
///
/// Baseline workflow:
///   save:    cargo bench --bench my_bench -- --save-baseline old
///   compare: cargo bench --bench my_bench -- --baseline old
fn regression_benchmark(c: &mut Criterion) {
    let mut group = c.benchmark_group("sum_optimization");
    let input = 10000;
    group.bench_function("old", |b| b.iter(|| old_implementation(black_box(input))));
    group.bench_function("new", |b| b.iter(|| new_implementation(black_box(input))));
    group.finish();
}
criterion_group!(benches, regression_benchmark);
criterion_main!(benches);
use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion};
/// Strategy under test: repeated `push_str` into an unsized `String`
/// (reallocates as it grows — no preallocation).
fn concat_push_str(strings: &[&str]) -> String {
    strings.iter().fold(String::new(), |mut acc, piece| {
        acc.push_str(piece);
        acc
    })
}
/// Strategy under test: compute the total length first, preallocate once,
/// then append — no reallocation during the appends.
fn concat_with_capacity(strings: &[&str]) -> String {
    let total_len: usize = strings.iter().map(|piece| piece.len()).sum();
    let mut out = String::with_capacity(total_len);
    for piece in strings {
        out.push_str(piece);
    }
    out
}
fn concat_collect(strings: &[&str]) -> String {
// Strategy under test: String's `FromIterator<&str>` impl via `collect` —
// the exact form is what this benchmark compares against the other methods.
strings.iter().copied().collect()
}
fn concat_join(strings: &[&str]) -> String {
// Strategy under test: `[&str]::join` with an empty separator — kept as
// `join("")` (not `concat()`) because the method itself is being measured.
strings.join("")
}
fn string_benchmark(c: &mut Criterion) {
let mut group = c.benchmark_group("string_concat");
let test_strings = vec!["hello", "world", "foo", "bar", "baz"];
let sizes = vec![10, 100, 1000];
for size in sizes {
let strings: Vec<&str> = test_strings
.iter()
.cycle()
.take(size)
.copied()
.collect();
group.bench_with_input(BenchmarkId::new("push_str", size), &strings, |b, s| {
b.iter(|| concat_push_str(black_box(s)))
});
group.bench_with_input(
BenchmarkId::new("with_capacity", size),
&strings,
|b, s| b.iter(|| concat_with_capacity(black_box(s))),
);
group.bench_with_input(BenchmarkId::new("collect", size), &strings, |b, s| {
b.iter(|| concat_collect(black_box(s)))
});
group.bench_with_input(BenchmarkId::new("join", size), &strings, |b, s| {
b.iter(|| concat_join(black_box(s)))
});
}
group.finish();
}
criterion_group!(benches, string_benchmark);
criterion_main!(benches);
use criterion::{criterion_group, criterion_main, Criterion, profiler::Profiler};
use std::fs::File;
use std::path::Path;
// Custom profiler implementation
/// Stub external-profiler hook. Criterion invokes these callbacks around a
/// benchmark when profiling is enabled (e.g. with `--profile-time`).
struct MyProfiler;
impl Profiler for MyProfiler {
// Called before the profiled run; `benchmark_dir` is where profiler output
// for `benchmark_id` should be written (unused in this stub).
fn start_profiling(&mut self, benchmark_id: &str, benchmark_dir: &Path) {
println!("Starting profiling for: {}", benchmark_id);
// Integration with perf, valgrind, etc.
}
// Called after the profiled run finishes.
fn stop_profiling(&mut self, benchmark_id: &str, benchmark_dir: &Path) {
println!("Stopping profiling for: {}", benchmark_id);
// Stop profiler and save data
}
}
/// Sum of squares 0..n — a simple, single hot loop for profiling demos.
fn heavy_computation(n: usize) -> usize {
    let mut total = 0;
    for i in 0..n {
        total += i * i;
    }
    total
}
/// Single benchmark intended to be run under a profiler.
/// Attach `MyProfiler` via `Criterion::default().with_profiler(MyProfiler)`
/// in the `criterion_group!` config if desired.
fn profiling_benchmark(c: &mut Criterion) {
    c.bench_function("heavy_computation", |b| {
        b.iter(|| heavy_computation(criterion::black_box(10000)))
    });
}
criterion_group!(benches, profiling_benchmark);
criterion_main!(benches);
use criterion::{black_box, criterion_group, criterion_main, Criterion};
use tokio::runtime::Runtime;
/// Simulates `n` microseconds of async work, then returns `n * 2`.
async fn async_computation(n: usize) -> usize {
    let delay = tokio::time::Duration::from_micros(n as u64);
    tokio::time::sleep(delay).await;
    n * 2
}
/// Spawns `n` tasks concurrently, then awaits them in spawn order so the
/// result vector is `[0, 2, 4, ..., 2*(n-1)]`.
async fn async_parallel(n: usize) -> Vec<usize> {
    let handles: Vec<_> = (0..n)
        .map(|i| tokio::spawn(async move { i * 2 }))
        .collect();
    let mut results = Vec::with_capacity(handles.len());
    for handle in handles {
        results.push(handle.await.unwrap());
    }
    results
}
/// Async benchmarks driven by one shared Tokio runtime.
fn async_benchmark(c: &mut Criterion) {
    let rt = Runtime::new().unwrap();
    // `async_computation(...)` already returns a future, so the closure can
    // hand it to `iter` directly — no wrapping `async` block needed.
    c.bench_function("async_computation", |b| {
        b.to_async(&rt).iter(|| async_computation(black_box(100)))
    });
    for task_count in [10usize, 100] {
        let name = format!("async_parallel_{}", task_count);
        c.bench_function(&name, |b| {
            b.to_async(&rt).iter(|| async_parallel(black_box(task_count)))
        });
    }
}
criterion_group!(benches, async_benchmark);
criterion_main!(benches);
Criterion uses statistical methods to:
Before measurements:
Running many iterations:
black_box() prevents compiler optimizations:
// Without black_box - may optimize away
b.iter(|| fibonacci(20));
// With black_box - forces computation
b.iter(|| fibonacci(black_box(20)));
// ❌ DON'T: Compiler may optimize away
c.bench_function("bad", |b| {
b.iter(|| expensive_function(42))
});
// ✅ DO: Use black_box to prevent optimization
c.bench_function("good", |b| {
b.iter(|| expensive_function(black_box(42)))
});
// ❌ DON'T: Include setup time
c.bench_function("bad", |b| {
b.iter(|| {
let data = vec![0; 10000]; // Setup!
process(&data)
})
});
// ✅ DO: Use iter_batched for setup
c.bench_function("good", |b| {
b.iter_batched(
|| vec![0; 10000], // Setup
|data| process(&data), // Benchmark
BatchSize::SmallInput,
)
});
// ❌ DON'T: Reduce sample size too much
group.sample_size(10); // Not enough for statistics
// ✅ DO: Use reasonable sample size
group.sample_size(100); // Default is usually good
// ❌ DON'T: Ignore high variance in results
// If variance is high, results are unreliable
// ✅ DO: Investigate why variance is high
// - System load
// - Cache effects
// - Non-deterministic algorithms
// ❌ DON'T: Benchmark in debug mode
// cargo bench runs in release mode by default
// ✅ DO: Always benchmark release builds
// cargo bench already compiles with optimizations (the `bench` profile inherits `release`)
use criterion::{
criterion_group, criterion_main, measurement::Measurement, BenchmarkGroup, Criterion,
};
use std::time::{Duration, Instant};
// Custom measurement that tracks allocations
/// Illustrative custom `Measurement`: reports an allocation count per
/// iteration instead of wall-clock time. The hooks here are stubs — a real
/// version would query the global allocator in `start`/`end`.
struct AllocationCounter {
// Count captured when measurement begins — unused in this stub.
start_count: usize,
}
impl Measurement for AllocationCounter {
// Value captured at measurement start.
type Intermediate = usize;
// Final per-measurement value handed to Criterion's statistics.
type Value = usize;
fn start(&self) -> Self::Intermediate {
// In reality, you'd query allocator
0
}
fn end(&self, i: Self::Intermediate) -> Self::Value {
// Return allocation count
i
}
// Criterion combines values from multiple iterations with `add`/`zero`.
fn add(&self, v1: &Self::Value, v2: &Self::Value) -> Self::Value {
v1 + v2
}
fn zero(&self) -> Self::Value {
0
}
fn to_f64(&self, val: &Self::Value) -> f64 {
// Criterion's statistical analysis operates on f64.
*val as f64
}
fn formatter(&self) -> &dyn criterion::measurement::ValueFormatter {
// Return custom formatter
// `&AllocationFormatter` is a promoted 'static borrow of the unit struct.
&AllocationFormatter
}
}
/// Formats `AllocationCounter` values for Criterion's console/report output.
struct AllocationFormatter;
impl criterion::measurement::ValueFormatter for AllocationFormatter {
fn format_value(&self, value: f64) -> String {
format!("{} allocations", value as usize)
}
fn format_throughput(&self, throughput: &criterion::Throughput, value: f64) -> String {
// NOTE(review): `throughput` is ignored — a fuller implementation would
// scale `value` by the configured Throughput.
format!("{} allocs/op", value)
}
// No unit scaling is performed; raw allocation counts are always reported.
fn scale_values(&self, _: f64, _: &mut [f64]) -> &'static str {
"allocations"
}
fn scale_throughputs(&self, _: f64, _: &mut [f64]) -> &'static str {
"allocs/op"
}
// Machine-readable output uses the same unscaled unit.
fn scale_for_machines(&self, values: &mut [f64]) -> &'static str {
"allocations"
}
}
use criterion::{criterion_group, criterion_main, Criterion};
use pprof::criterion::{Output, PProfProfiler};
/// Mixed-size recursive fibonacci calls — enough varied work to produce an
/// interesting flamegraph.
fn complex_computation() {
    let total: u32 = (0..10000u32).map(|i| fibonacci_recursive(i % 20)).sum();
    criterion::black_box(total);
}
/// Naive exponential-time fibonacci (the workload being profiled).
fn fibonacci_recursive(n: u32) -> u32 {
    if n < 2 {
        n
    } else {
        fibonacci_recursive(n - 1) + fibonacci_recursive(n - 2)
    }
}
fn flamegraph_benchmark(c: &mut Criterion) {
c.bench_function("complex", |b| b.iter(|| complex_computation()));
}
criterion_group! {
name = benches;
config = Criterion::default().with_profiler(PProfProfiler::new(100, Output::Flamegraph(None)));
targets = flamegraph_benchmark
}
criterion_main!(benches);
use criterion::{black_box, criterion_group, criterion_main, Criterion};
// Different vector initialization strategies
/// Strategy under test: preallocate exactly `n` slots, then push each
/// element (no reallocation expected during the pushes).
fn vec_with_capacity(n: usize) -> Vec<i32> {
    let mut buf = Vec::with_capacity(n);
    for idx in 0..n {
        buf.push(idx as i32);
    }
    buf
}
fn vec_from_iterator(n: usize) -> Vec<i32> {
// Strategy under test: `collect` sizes the Vec up front from the range's
// exact `size_hint`, so no reallocation occurs.
(0..n as i32).collect()
}
fn vec_with_resize(n: usize) -> Vec<i32> {
// Strategy under test: grow with `resize`, then overwrite by index.
// The bounds-checked `v[i]` writes are intentional — they are part of what
// this benchmark compares against the other initialization styles.
let mut v = Vec::new();
v.resize(n, 0);
for i in 0..n {
v[i] = i as i32;
}
v
}
fn vec_from_macro(n: usize) -> Vec<i32> {
// Strategy under test: `vec![0; n]`. Note this produces all zeros, unlike
// the other strategies which fill 0..n — it measures allocation only.
vec![0; n]
}
fn comparative_benchmark(c: &mut Criterion) {
let mut group = c.benchmark_group("vec_initialization");
let sizes = vec![10, 100, 1000, 10000];
for size in sizes {
group.bench_with_input(
criterion::BenchmarkId::new("with_capacity", size),
&size,
|b, &size| b.iter(|| vec_with_capacity(black_box(size))),
);
group.bench_with_input(
criterion::BenchmarkId::new("from_iterator", size),
&size,
|b, &size| b.iter(|| vec_from_iterator(black_box(size))),
);
group.bench_with_input(
criterion::BenchmarkId::new("with_resize", size),
&size,
|b, &size| b.iter(|| vec_with_resize(black_box(size))),
);
group.bench_with_input(
criterion::BenchmarkId::new("from_macro", size),
&size,
|b, &size| b.iter(|| vec_from_macro(black_box(size))),
);
}
group.finish();
}
criterion_group!(benches, comparative_benchmark);
criterion_main!(benches);
// Benchmarks for std collections
cargo bench --manifest-path library/std/Cargo.toml
// Criterion benchmarks for async runtime
#[tokio::test]
async fn benchmark_task_spawning() {
// ...
}
// Benchmarking serialization performance
criterion_group!(benches, json_benchmark, bincode_benchmark);
// Benchmarking regex compilation and matching
c.bench_function("compile", |b| b.iter(|| Regex::new(pattern)));
[dev-dependencies]
criterion = { version = "0.5", features = ["html_reports"] }
[[bench]]
name = "my_benchmark"
harness = false
use criterion::{Criterion, PlotConfiguration, AxisScale};
/// Builds a `Criterion` instance with tightened measurement settings.
///
/// Fix: `Duration` is referenced by full path — this snippet's `use` line
/// (`criterion::{Criterion, PlotConfiguration, AxisScale}`) does not import
/// `std::time::Duration`, so the unqualified name failed to resolve.
fn custom_criterion() -> Criterion {
    Criterion::default()
        .sample_size(1000) // More samples (default is 100)
        .measurement_time(std::time::Duration::from_secs(10))
        .warm_up_time(std::time::Duration::from_secs(3))
        .noise_threshold(0.05) // 5% noise tolerance
        .significance_level(0.05) // 95% confidence
        .plot_config(
            PlotConfiguration::default().summary_scale(AxisScale::Logarithmic),
        )
}
# Run all benchmarks
cargo bench
# Run specific benchmark
cargo bench --bench my_benchmark
# Save baseline
cargo bench -- --save-baseline master
# Compare with baseline
cargo bench -- --baseline master
# Filter benchmarks
cargo bench fibonacci
# Generate profiles
cargo bench --bench my_benchmark -- --profile-time=5
# List benchmarks without running
cargo bench -- --list
# Verbose output
cargo bench -- --verbose
# Quick mode (fewer samples)
cargo bench -- --quick
fib_recursive_20 time: [25.123 us 25.234 us 25.389 us]
change: [-1.2345% +0.1234% +1.4567%] (p = 0.89 > 0.05)
No change in performance detected.
fib_iterative_20 time: [123.45 ns 124.23 ns 125.67 ns]
change: [-15.234% -13.456% -11.234%] (p = 0.00 < 0.05)
Performance has improved.
thrpt: [789.12 KiB/s 812.34 KiB/s 845.67 KiB/s]
Found 5 outliers among 100 measurements (5.00%)
2 (2.00%) low mild
3 (3.00%) high mild
fn main() {
// Placeholder entry point: Criterion benchmarks are registered through the
// `criterion_group!`/`criterion_main!` macros and are executed by
// `cargo bench`, not by running this binary directly.
println!("Run benchmarks with: cargo bench");
}
name: Benchmarks
on: [push, pull_request]
jobs:
benchmark:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- uses: actions-rs/toolchain@v1
with:
toolchain: stable
- name: Run benchmarks
run: cargo bench -- --save-baseline PR_${{ github.event.number }}
- name: Compare with master
run: cargo bench -- --baseline master
- name: Upload results
uses: actions/upload-artifact@v2
with:
name: benchmark-results
path: target/criterion/
// Ensure compiler doesn't optimize away your code
b.iter(|| {
let result = expensive_function(black_box(input));
black_box(result)
});
// LargeInput: Setup is expensive, amortize over many iterations
// SmallInput: Setup is cheap, run once per iteration
// PerIteration: Setup must run every time
b.iter_batched(setup, routine, BatchSize::SmallInput);
// Some benchmarks need extra warmup
group.warm_up_time(Duration::from_secs(5));
// Reduce system interference
// - Close other applications
// - Disable CPU frequency scaling
// - Use `nice` to set priority
// Save different baselines for comparison
// cargo bench -- --save-baseline v1.0
// cargo bench -- --save-baseline v2.0
// cargo bench -- --baseline v1.0
Note: these examples are meant to be run locally with `cargo bench` (they require a `[[bench]]` target with `harness = false` and Criterion as a dev-dependency); the official Rust Playground cannot run Criterion's bench harness.