Tutorial 1: Tensors
Tensors are the fundamental data type in flodl — n-dimensional arrays of numbers backed by libtorch. This tutorial covers creation, operations, error handling, and memory management.
Creating Tensors
All creation functions return Result<Tensor>.
use flodl::{Tensor, TensorOptions, Device, DType};
// From Rust data — data is copied into libtorch
let t = Tensor::from_f32(&[1.0, 2.0, 3.0, 4.0, 5.0, 6.0], &[2, 3], Device::CPU)?;
// Filled tensors
let opts = TensorOptions::default(); // Float32, CPU
let zeros = Tensor::zeros(&[3, 4], opts)?;
let ones = Tensor::ones(&[3, 4], opts)?;
// Random tensors (seed with manual_seed() for reproducibility)
let uniform = Tensor::rand(&[2, 3], opts)?; // values in [0, 1)
let normal = Tensor::randn(&[2, 3], opts)?; // standard normal
// Integer tensor (for indices, e.g. Embedding lookups)
let idx = Tensor::from_i64(&[0, 3, 7], &[3], Device::CPU)?;
Options
TensorOptions is a plain struct with dtype and device fields:
let opts = TensorOptions { dtype: DType::Float64, ..Default::default() };
let t = Tensor::ones(&[4], opts)?;
let gpu_opts = TensorOptions { device: Device::CUDA(0), ..Default::default() };
let t = Tensor::zeros(&[3, 3], gpu_opts)?;
Shape Inspection
let t = Tensor::from_f32(&[1.0, 2.0, 3.0, 4.0, 5.0, 6.0], &[2, 3], Device::CPU)?;
t.shape(); // [2, 3]
t.ndim(); // 2
t.numel(); // 6
t.dtype(); // DType::Float32
t.device(); // Device::CPU
Operations
Operations return new tensors — the originals are never modified (in-place variants, such as the foreach_* family later in this tutorial, carry a trailing underscore). Every operation returns Result<Tensor>, and the ? operator propagates errors:
let a = Tensor::from_f32(&[1.0, 2.0, 3.0, 4.0], &[2, 2], Device::CPU)?;
let b = Tensor::from_f32(&[5.0, 6.0, 7.0, 8.0], &[2, 2], Device::CPU)?;
let result = a.add(&b)?.matmul(&b)?.relu()?;
Arithmetic
a.add(&b)? // element-wise a + b
a.sub(&b)? // element-wise a - b
a.mul(&b)? // element-wise a * b (Hadamard product)
a.div(&b)? // element-wise a / b
a.matmul(&b)? // matrix multiplication
a.mul_scalar(0.5)? // multiply every element by a scalar
a.add_scalar(1.0)? // add a scalar to every element
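A quick worked example, with results verifiable by hand:
let a = Tensor::from_f32(&[1.0, 2.0, 3.0, 4.0], &[2, 2], Device::CPU)?;
let b = Tensor::from_f32(&[5.0, 6.0, 7.0, 8.0], &[2, 2], Device::CPU)?;
a.add(&b)?.to_f32_vec()?; // [6.0, 8.0, 10.0, 12.0]
a.matmul(&b)?.to_f32_vec()?; // [19.0, 22.0, 43.0, 50.0]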
Activations
t.relu()? // max(0, x)
t.sigmoid()? // 1 / (1 + exp(-x))
t.tanh_op()? // hyperbolic tangent
t.gelu()? // Gaussian Error Linear Unit
t.silu()? // x * sigmoid(x) (Swish)
t.selu()? // scaled ELU (self-normalizing)
t.hardswish()? // efficient Swish approximation
t.hardsigmoid()? // piecewise-linear sigmoid
t.prelu(&weight)? // parametric ReLU
t.softmax(-1)? // softmax along dimension
t.log_softmax(-1)? // log(softmax(x)) — numerically stable
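For example, softmax along the last dimension turns each row into a probability distribution that sums to 1:
let t = Tensor::from_f32(&[1.0, 2.0, 3.0], &[1, 3], Device::CPU)?;
t.softmax(-1)?.to_f32_vec()?; // ≈ [0.0900, 0.2447, 0.6652]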
Math
t.exp()? // e^x
t.log()? // ln(x)
t.sqrt()? // square root
t.neg()? // negation
t.abs()? // absolute value
t.pow_scalar(2.0)? // element-wise power
t.clamp(-1.0, 1.0)? // clamp to range
t.clamp_min(0.0)? // clamp from below
t.clamp_max(1.0)? // clamp from above
t.reciprocal()? // 1/x
t.sign()? // sign (-1, 0, or 1)
t.floor()? // round down
t.ceil()? // round up
t.round()? // round to nearest
t.trunc()? // truncate toward zero
t.frac()? // fractional part
Trigonometry
t.sin()? // sine
t.cos()? // cosine
t.tan()? // tangent
t.asin()? // arc sine
t.acos()? // arc cosine
t.atan()? // arc tangent
Numerically Stable
t.log1p()? // ln(1 + x) — stable for small x
t.expm1()? // exp(x) - 1 — stable for small x
t.log2()? // log base 2
t.log10()? // log base 10
t.erf()? // Gauss error function
t.erfc()? // complementary error function (1 - erf)
Modular Arithmetic
t.fmod(3.0)? // C-style remainder (scalar)
t.fmod_tensor(&divisor)? // C-style remainder (tensor)
t.remainder(3.0)? // Python-style modulo (scalar)
t.remainder_tensor(&other)? // Python-style modulo (tensor)
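The two families differ on negative inputs: fmod keeps the sign of the dividend, remainder keeps the sign of the divisor.
let t = Tensor::from_f32(&[-7.0, 7.0], &[2], Device::CPU)?;
t.fmod(3.0)?.to_f32_vec()?; // [-1.0, 1.0]
t.remainder(3.0)?.to_f32_vec()?; // [2.0, 1.0]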
Fused Operations
// beta * self + alpha * (mat1 @ mat2), fused into a single kernel
t.addmm(&mat1, &mat2, 1.0, 1.0)?;
// self + tensor1 * tensor2 * value — fused multiply-accumulate
t.addcmul(&t1, &t2, 1.0)?;
// self + tensor1 / tensor2 * value — fused divide-accumulate
t.addcdiv(&t1, &t2, 1.0)?;
// linear interpolation: self + (end - self) * weight
t.lerp(&end, 0.5)?;
t.lerp_tensor(&end, &weights)?; // per-element weight
// element-wise closeness check
t.isclose(&other, 1e-5, 1e-8)?;
Reductions
t.sum()? // reduce all elements to scalar
t.mean()? // mean of all elements
t.sum_dim(1, true)? // reduce along dim, keep dimension
t.mean_dim(1, true)? // mean along dim
t.max()? // scalar max
t.min()? // scalar min
t.max_dim(1, true)? // max along dim, keep dimension
t.min_dim(1, true)? // min along dim, keep dimension
t.argmax(-1)? // index of max along dim
t.var()? // variance of all elements
t.std()? // standard deviation
t.var_dim(1, true)? // variance along dim
t.std_dim(1, true)? // std along dim
t.prod()? // product of all elements
t.prod_dim(1, true)? // product along dim
t.cumsum(0)? // cumulative sum along dim
t.logsumexp(1, true)? // log(sum(exp(x))) — numerically stable
t.norm()? // L2 norm of all elements
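A worked example on a [2, 3] tensor (rows [1, 2, 3] and [4, 5, 6]):
let t = Tensor::from_f32(&[1.0, 2.0, 3.0, 4.0, 5.0, 6.0], &[2, 3], Device::CPU)?;
t.sum()?.item()?; // 21.0
t.mean()?.item()?; // 3.5
t.sum_dim(1, true)?.to_f32_vec()?; // [6.0, 15.0], shape [2, 1]
t.argmax(-1)?.to_i64_vec()?; // [2, 2]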
Shape Manipulation
t.reshape(&[6, 1])? // new shape, same data
t.transpose(0, 1)? // swap two dimensions
t.flatten(0, -1)? // flatten all dims
t.squeeze(0)? // remove dim of size 1
t.unsqueeze(0)? // add dim of size 1
t.unsqueeze_many(&[0, 2])? // add multiple dims at once
t.permute(&[1, 0])? // arbitrary axis reorder
t.contiguous()? // ensure contiguous memory layout
t.movedim(0, 2)? // move dimension to new position
t.flip(&[0, 1])? // reverse along dimensions
t.roll(2, 0)? // circular shift along dim
t.diagonal(0, 0, 1)? // extract diagonal
t.tile(&[2, 3])? // repeat by tiling
t.triu(0)? // upper triangular
t.tril(0)? // lower triangular
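A few of these traced on a [2, 3] tensor, with the resulting shapes:
let t = Tensor::rand(&[2, 3], TensorOptions::default())?;
t.reshape(&[6, 1])?.shape(); // [6, 1]
t.transpose(0, 1)?.shape(); // [3, 2]
t.unsqueeze(0)?.shape(); // [1, 2, 3]
t.flatten(0, -1)?.shape(); // [6]
t.tile(&[2, 3])?.shape(); // [4, 9]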
Slicing and Joining
t.narrow(0, 1, 2)? // extract a contiguous slice along dim
t.select(0, 1)? // pick one index along dim, removing that dim
a.cat(&b, 0)? // concatenate two tensors along dim
Tensor::cat_many(&[&a, &b, &c], 0)? // concatenate many tensors
Tensor::stack(&[&a, &b], 0)? // stack along new dim
t.index_select(0, &indices)? // gather slices at given indices
t.split(2, 0)? // split into chunks of size 2 along dim
t.chunk(3, 0)? // split into N equal chunks along dim
t.unbind(0)? // remove a dim, returning Vec<Tensor>
t.repeat(&[2, 3])? // repeat tensor along each dim
t.pad(&[1, 1], 0.0)? // constant-value padding
t.pad_mode(&[1, 1], 1, 0.0)? // mode: 0=constant, 1=reflect, 2=replicate, 3=circular
t.batches(32)? // split into mini-batches along dim 0
Tensor::meshgrid(&[&x, &y])? // coordinate grids (ij indexing)
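For example, with a and b both of shape [2, 3], cat grows an existing dimension while stack adds a new one:
let a = Tensor::rand(&[2, 3], TensorOptions::default())?;
let b = Tensor::rand(&[2, 3], TensorOptions::default())?;
a.cat(&b, 0)?.shape(); // [4, 3]
Tensor::stack(&[&a, &b], 0)?.shape(); // [2, 2, 3]
let rows = a.split(1, 0)?; // two tensors of shape [1, 3]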
Comparisons and Conditionals
let mask = x.gt(&threshold)?; // element-wise >
let mask = x.gt_scalar(0.0)?; // compare with scalar
let mask = x.lt_scalar(1.0)?; // less than scalar
let mask = x.ge_scalar(0.0)?; // greater or equal
let mask = x.le_scalar(1.0)?; // less or equal
let y = Tensor::where_cond(&mask, &a, &b)?; // conditional select
// Element-wise min/max of two tensors
let z = a.maximum(&b)?;
let z = a.minimum(&b)?;
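For example, where_cond can build a ReLU by hand: take x where the mask is true, zeros elsewhere.
let x = Tensor::from_f32(&[-1.0, 2.0, -3.0, 4.0], &[4], Device::CPU)?;
let mask = x.gt_scalar(0.0)?;
let zeros = Tensor::zeros(&[4], TensorOptions::default())?;
let y = Tensor::where_cond(&mask, &x, &zeros)?;
y.to_f32_vec()?; // [0.0, 2.0, 0.0, 4.0]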
Similarity and Normalization
// Cosine similarity along a dimension
let sim = a.cosine_similarity(&b, 1, 1e-8)?;
// Lp normalization along a dimension
let normed = t.normalize(2.0, 1)?; // L2-normalize along dim 1
// Masked fill — set elements where mask is true
let y = t.masked_fill(&mask, 0.0)?;
Extracting Data
Copy tensor data back to Rust vectors:
let t = Tensor::from_f32(&[1.0, 2.0, 3.0], &[3], Device::CPU)?;
let data: Vec<f32> = t.to_f32_vec()?; // [1.0, 2.0, 3.0]
let item: f64 = t.select(0, 0)?.item()?; // scalar value as f64
to_f64_vec() and to_i64_vec() are also available.
Memory Management
Tensors are backed by C++ memory managed through libtorch. Rust’s ownership
system handles cleanup automatically via Drop — you never need to free
tensors manually.
{
let t = Tensor::zeros(&[1000, 1000], TensorOptions::default())?;
// ... use t ...
} // t is dropped here — C++ memory freed immediately
This is a fundamental advantage over garbage-collected languages. In Go,
tensor memory can linger until the GC runs, requiring explicit Release()
calls and VRAM budget heuristics. In Rust, memory is freed deterministically
at the end of the owning scope.
Clone on a Tensor shares the underlying data (like PyTorch’s shallow
copy). The C++ TensorImpl is reference-counted internally by libtorch.
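A minimal sketch of the consequence: cloning is cheap, and the storage lives until the last handle is dropped.
let t = Tensor::ones(&[1000, 1000], TensorOptions::default())?;
let alias = t.clone(); // bumps a refcount; no data copy
drop(t); // storage survives, alias still owns it
let s = alias.sum()?; // valid; memory is freed when alias drops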
Diagnostics
Utilities for debugging memory and performance issues:
use flodl::{live_tensor_count, rss_kb};
// Number of live C++ Tensor handles (created but not yet dropped).
// If this grows over time, there is a tensor handle leak.
println!("live handles: {}", live_tensor_count());
// Current process RSS in kilobytes (Linux only).
println!("RSS: {}MB", rss_kb() / 1024);
// Count autograd nodes reachable from a tensor — measures graph
// complexity. Compare against Python to detect decomposed-op bloat.
let loss = mse_loss(&pred, &target)?;
println!("autograd nodes: {}", loss.data().autograd_node_count());
Peak VRAM Tracking
On CUDA devices, you can track peak GPU memory usage during a training step or any other section of code. These match the standard PyTorch memory diagnostics:
| flodl | PyTorch equivalent |
|---|---|
| cuda_peak_active_bytes() | torch.cuda.max_memory_allocated() |
| cuda_peak_reserved_bytes() | torch.cuda.max_memory_reserved() |
| cuda_reset_peak_stats() | torch.cuda.reset_peak_memory_stats() |
“Active” bytes are memory currently holding tensor data. “Reserved” bytes include the CUDA caching allocator’s free pool — memory that libtorch has obtained from the driver but is not currently in use by any tensor. The gap between the two tells you how much allocator headroom exists.
A typical pattern for profiling a training step:
use flodl::{cuda_reset_peak_stats, cuda_peak_active_bytes, cuda_peak_reserved_bytes,
cuda_empty_cache};
// Flush the allocator cache so reserved starts from a clean baseline
cuda_empty_cache();
cuda_reset_peak_stats();
// --- run one training step ---
let output = model.forward(&batch)?;
let loss = mse_loss(&output, &targets)?;
loss.backward()?;
optimizer.step()?;
optimizer.zero_grad()?;
// Read peaks
let active_mb = cuda_peak_active_bytes()? as f64 / 1048576.0;
let reserved_mb = cuda_peak_reserved_bytes()? as f64 / 1048576.0;
println!("peak active: {active_mb:.1} MB, peak reserved: {reserved_mb:.1} MB");
The _idx variants (cuda_peak_active_bytes_idx, cuda_peak_reserved_bytes_idx,
cuda_reset_peak_stats_idx) accept an explicit device index for multi-GPU setups.
Device Transfer
let gpu = t.to_device(Device::CUDA(0))?; // move to GPU
let cpu = gpu.to_device(Device::CPU)?; // move back to CPU
if flodl::cuda_available() {
println!("CUDA devices: {}", flodl::cuda_device_count());
}
Non-blocking Device Transfer
The default to_device() blocks until the transfer completes. For CPU-to-GPU
transfers, you can overlap the copy with CPU work by using pinned memory and an
asynchronous transfer:
use flodl::{Tensor, Device, TensorOptions, cuda_synchronize};
let cpu_tensor = Tensor::randn(&[256, 512], TensorOptions::default())?;
// Pin the CPU tensor into page-locked memory (requires CUDA)
let pinned = cpu_tensor.pin_memory()?;
// Launch the transfer — returns immediately
let gpu = pinned.to_device_async(Device::CUDA(0))?;
// ... do CPU work while the DMA transfer runs ...
// Synchronize before using the GPU tensor
cuda_synchronize(0);
pin_memory() allocates the tensor in page-locked (pinned) host memory, which
the GPU can DMA directly without an intermediate staging copy. This matters most
when you are streaming large batches to the GPU and want to keep the CPU busy
with data preprocessing for the next batch.
You can check whether a tensor is already pinned with t.is_pinned().
Reproducibility
Seed libtorch’s RNG before creating random tensors or models:
flodl::manual_seed(42); // seeds CPU + CUDA RNGs
After seeding, Tensor::rand, Tensor::randn, dropout masks, and weight
initialization all produce deterministic results.
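For example, re-seeding puts the RNG back in the same state, so identical draws follow:
flodl::manual_seed(42);
let a = Tensor::rand(&[4], TensorOptions::default())?;
flodl::manual_seed(42);
let b = Tensor::rand(&[4], TensorOptions::default())?;
assert_eq!(a.to_f32_vec()?, b.to_f32_vec()?);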
For CPU-side randomness (shuffling datasets, augmentation), use the Rng
struct:
use flodl::Rng;
let mut rng = Rng::seed(42);
rng.shuffle(&mut data);
See Tutorial 4: Training for full reproducibility setup.
cuDNN Benchmark Mode
For fixed-size workloads (fixed batch size, fixed image dimensions), enabling cuDNN benchmark mode lets cuDNN auto-tune convolution algorithms on the first call:
flodl::set_cudnn_benchmark(true); // opt-in, 5-10% speedup for fixed shapes
Leave this off for dynamic-shape workloads (variable-length sequences, multi-resolution images) — the warmup cost can hurt throughput.
Memory Format (Channels Last)
By default, 4D tensors use the NCHW memory layout (batch, channels, height, width). GPUs with Tensor Cores (NVIDIA Volta and newer) can run convolutions significantly faster when tensors are stored in NHWC order, also called “channels last.”
let images = Tensor::randn(&[8, 3, 224, 224], TensorOptions::default())?;
// Convert to channels-last layout
let images_cl = images.to_channels_last()?;
assert!(images_cl.is_channels_last());
assert_eq!(images_cl.shape(), &[8, 3, 224, 224]); // logical shape unchanged
The logical shape stays the same — only the physical memory stride order changes. This avoids format-conversion overhead inside cuDNN convolution kernels. On Tensor Core GPUs, expect an 8–35% speedup for Conv2d-heavy workloads, depending on layer sizes and batch dimensions.
Convert your input tensors and model weights to channels-last format before training. flodl’s Conv2d will pick up the layout automatically.
Foreach Operations
Foreach operations apply the same operation to a list of tensors in a single fused CUDA kernel launch. When you have dozens or hundreds of parameter tensors (typical in any real model), this eliminates per-tensor kernel launch overhead.
All foreach functions are associated functions on Tensor:
use flodl::Tensor;
let params: Vec<Tensor> = model.parameters();
// Zero every tensor in the list in one launch (e.g. a list of gradients)
Tensor::foreach_zero_(&params)?;
// Scale every tensor by a constant
Tensor::foreach_mul_scalar_(&params, 0.99)?;
// Add a constant to every tensor
Tensor::foreach_add_scalar_(&params, 1e-8)?;
There are also multi-list variants that operate on pairs of tensor lists element-wise:
// params[i] += grads[i] * alpha, for all i
Tensor::foreach_add_list_(&params, &grads, -0.01)?;
// params[i] = lerp(params[i], targets[i], weight), for all i
Tensor::foreach_lerp_scalar_(&params, &targets, 0.1)?;
// In-place sqrt of every tensor
Tensor::foreach_sqrt_(&params)?;
// Compute the L2 norm of each tensor — returns a Vec<Tensor> of scalars
let norms: Vec<Tensor> = Tensor::foreach_norm(&params, 2.0)?;
You typically do not need to call these directly. flodl’s fused optimizers (Adam, AdamW) and gradient clipping routines use foreach operations internally to minimize kernel launch overhead. They are exposed in the public API for advanced use cases like custom optimizers or manual parameter surgery.
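For instance, the core of a hand-rolled SGD step reduces to two of the calls above. This is only a sketch (it assumes you have gathered gradients into a grads list parallel to params; flodl's built-in optimizers handle this internally):
let lr = 0.01;
// params[i] += grads[i] * (-lr): one fused launch for the whole model
Tensor::foreach_add_list_(&params, &grads, -lr)?;
// Clear the gradients for the next step, also in one launch
Tensor::foreach_zero_(&grads)?;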