Goal: build a memory allocator whose primary design metric is how easy it is for the compiler to reason about. Code that is easy to reason about encourages inlining, and inlining encourages vectorization.
We go so far as to define
typedef float** allocator;

float* alloc(allocator s, unsigned int size_floats) {
    *s = *s - size_floats;  // bump the stored pointer down by the request
    return *s;              // the new top is the caller's block
}
Then, instead of paying the complexity price inside the allocation path itself, we will build scaffolding out of operating system and hardware features to make the above safe.
First, how do we free memory? We never free; we just discard allocators:
#define allocator_push( old_allocator, new_allocator ) \
    float * stack_allocator_storage = * old_allocator; \
    allocator new_allocator = &stack_allocator_storage;
All functions simply assume they get an allocator as their first argument.
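For a feel of the idiom, here is a minimal sketch (sum_squares and its body are mine, not part of the design; vector is the struct defined later in this proposal). A function pushes a scratch allocator, and everything allocated through it is implicitly discarded on return, because the copied storage pointer goes out of scope:

// hypothetical callee: scratch allocations die when the function returns
float sum_squares(allocator a, vector v) {
    allocator_push(a, scratch);          // scratch starts at a's current top
    float *tmp = alloc(scratch, v.len);  // allocated below *a, never freed
    float total = 0;
    for (int i = 0; i < v.len; i++) {
        tmp[i] = v.data[i] * v.data[i];
        total += tmp[i];
    }
    return total;  // *a is untouched; the scratch region is simply abandoned
}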
How do we expand memory? There are two options. On the one hand, we could never expand memory, and instead always initially allocate a block equal to the size of physical memory. Physical pages only actually get claimed as they are written to. In this case, responding to the allocator filling up is the responsibility of the operating system; neither the program nor the compiler knows about it. Now our enemy is the OOM killer: a failing malloc was friendly and catchable, but the OOM killer is mysterious and will start killing processes as we write to too many pages in the allocated block.
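A sketch of what option one might look like on Linux (make_big_allocator and the static top are my own illustrative scaffolding, not part of the design; MAP_NORESERVE keeps the kernel from pre-committing swap for the whole reservation):

#include <sys/mman.h>
#include <unistd.h>

static float *top;  // storage for the allocator's moving pointer (sketch only)

allocator make_big_allocator(void) {
    size_t bytes = (size_t)sysconf(_SC_PHYS_PAGES) * (size_t)sysconf(_SC_PAGE_SIZE);
    // error handling omitted for brevity; mmap can return MAP_FAILED
    float *base = mmap(NULL, bytes, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE, -1, 0);
    top = base + bytes / sizeof(float);  // bump downward from the top, as in alloc()
    return &top;
}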
Second option: allocate just a little memory at first, catch the segfault from writing past the end of the allocation, and expand it. I think this is less demanding of system resources (especially if huge pages aren't turned on). It will require modifying malloc to reserve a section of virtual address space, since it requires controlling the virtual address space to forbid putting anything in the way of where the bump allocator could expand.
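And a sketch of option two (again Linux-specific, and all names and sizes here are my own): reserve a large span of address space with PROT_NONE so nothing else can be mapped in the way, then commit chunks from a SIGSEGV handler as the allocator walks down into them. Caveat: mprotect is not formally async-signal-safe, though this pattern works in practice on Linux.

#include <signal.h>
#include <stdint.h>
#include <stdlib.h>
#include <sys/mman.h>

#define RESERVE_BYTES (1ull << 36)  // 64 GiB of address space, unbacked until touched
#define COMMIT_BYTES  (1ull << 20)  // commit 1 MiB per fault

static char *region;

static void on_segv(int sig, siginfo_t *info, void *ctx) {
    (void)sig; (void)ctx;
    char *addr = (char *)info->si_addr;
    if (addr < region || addr >= region + RESERVE_BYTES)
        abort();  // a genuine crash, not the allocator growing
    // make the faulting chunk writable; the faulting write then retries
    uintptr_t off = (uintptr_t)(addr - region) & ~(uintptr_t)(COMMIT_BYTES - 1);
    mprotect(region + off, COMMIT_BYTES, PROT_READ | PROT_WRITE);
}

float *reserve_allocator_region(void) {
    region = mmap(NULL, RESERVE_BYTES, PROT_NONE,
                  MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE, -1, 0);
    struct sigaction sa = {0};
    sa.sa_sigaction = on_segv;
    sa.sa_flags = SA_SIGINFO;
    sigaction(SIGSEGV, &sa, NULL);
    return (float *)(region + RESERVE_BYTES);  // allocator bumps down from the top
}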
Basically, this is just a second stack. A motivating code sample for why bump allocators have huge ergonomic advantages: they work with the array-programming habits I have from Python.
// branchless bump-allocator differential equation integration
matrix rk4(allocator source, vector (*dydt)(allocator, vector), vector y, float h,
           float tfinal) {
    float t = 0;
    matrix output = new_matrix(source, y.len, tfinal / h, NULL);
    int i = 0;
    allocator_push(source, is)
    while (t < tfinal - h) {
        t += h;
        vector k1 = dydt(is, y);
        vector step = k1;
        vector k2 = dydt(is, add_vv(is, y, mul_vs(is, k1, h / 2)));
        step = add_vv(is, step, mul_vs(is, k2, 2));
        vector k3 = dydt(is, add_vv(is, y, mul_vs(is, k2, h / 2)));
        step = add_vv(is, step, mul_vs(is, k3, 2));
        vector k4 = dydt(is, add_vv(is, y, mul_vs(is, k3, h)));
        step = add_vv(is, step, k4);
        step = mul_vs(is, step, h / 6.);
        y = add_vv(is, y, step);
        set_col(output, y, i);
        // This is a little dodgy: reset the pushed allocator, discarding
        // everything allocated this iteration, then re-read y from output
        stack_allocator_storage = *source;
        y = get_col(is, output, i);
        i += 1;
    }
    return output;
}
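The marked reset line reaches into the variable that allocator_push declared behind the scenes. A companion macro (my suggestion, not part of the original interface) at least gives the operation a name:

// rewind the storage declared by allocator_push back to the pushed position
#define allocator_reset(old_allocator) \
    stack_allocator_storage = *(old_allocator)

With that, the dodgy line becomes allocator_reset(source);.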
Whereas with the math functions mallocing their return values, freeing is an absolute burden. (This is what RAII is for, of course, so the real question is whether there will be big performance gains.)
// glibc malloc-based differential equation integration
matrix rk4(allocator source, vector (*dydt)(vector), vector y, float h,
           float tfinal) {
    float t = 0;
    matrix output = new_matrix(source, y.len, tfinal / h, NULL);
    int i = 0;
    while (t < tfinal - h) {
        t += h;
        vector k1 = dydt(y);
        vector step = k1;
        vector scaled_k1 = mul_vs(k1, h / 2);
        vector y_plus_scaled_k1 = add_vv(scaled_k1, y);
        vector k2 = dydt(y_plus_scaled_k1);
        vector two_k2 = mul_vs(k2, 2);
        vector step2 = add_vv(step, two_k2);
        vector scaled_k2 = mul_vs(k2, h / 2);
        vector y_plus_scaled_k2 = add_vv(y, scaled_k2);
        vector k3 = dydt(y_plus_scaled_k2);
        vector two_k3 = mul_vs(k3, 2);
        vector step3 = add_vv(step2, two_k3);
        vector scaled_k3 = mul_vs(k3, h);
        vector y_plus_scaled_k3 = add_vv(y, scaled_k3);
        vector k4 = dydt(y_plus_scaled_k3);
        vector step4 = add_vv(step3, k4);
        vector scaled_step = mul_vs(step4, h / 6.);
        vector new_y = add_vv(y, scaled_step);
        set_col(output, new_y, i);
        // memory management; step aliases k1, so it is not freed separately
        free_vector(k1);
        free_vector(scaled_k1);
        free_vector(y_plus_scaled_k1);
        free_vector(k2);
        free_vector(two_k2);
        free_vector(step2);
        free_vector(scaled_k2);
        free_vector(y_plus_scaled_k2);
        free_vector(k3);
        free_vector(two_k3);
        free_vector(step3);
        free_vector(scaled_k3);
        free_vector(y_plus_scaled_k3);
        free_vector(k4);
        free_vector(step4);
        free_vector(scaled_step);
        free_vector(new_y);
        y = get_col(output, i);
        i += 1;
    }
    return output;
}
Preliminary experiments:
As a first pass, we can confirm that at least the compiler is much happier reasoning about this memory allocation approach than about glibc's malloc. We write a program that adds two hardcoded vectors and returns the first element of the result:
#include <stdlib.h>

typedef struct {
    float* data;
    int len;
} vector;

typedef float** allocator;

float* alloc(allocator s, int len) {
    *s = *s - len;
    return *s;
}

vector add(allocator s, vector a, vector b) {
    vector res;
    res.data = alloc(s, a.len);
    res.len = a.len;
    for (int i = 0; i < a.len; i++) {
        res.data[i] = a.data[i] + b.data[i];
    }
    return res;
}

vector vec(allocator s, int len, float* data) {
    vector res;
    res.data = alloc(s, len);
    res.len = len;
    for (int i = 0; i < len; i++) {
        res.data[i] = data[i];
    }
    return res;
}

int main() {
    float* memory = malloc(1000 * sizeof(float));
    float* float_ptr = memory + 1000;  // bump allocator starts at the top
    allocator s = &float_ptr;
    vector a = vec(s, 2, (float[]){1, 2});
    vector b = vec(s, 2, (float[]){3, 4});
    vector c = add(s, a, b);
    int ret = (int) c.data[0];
    free(memory);
    return ret;
}
gcc happily compiles this to
main:
mov eax, 4
ret
This is in contrast to the analogous program using a real allocator:
#include <stdlib.h>

typedef struct {
    float* data;
    int len;
} vector;

vector add(vector a, vector b) {
    vector res;
    res.data = malloc(a.len * sizeof(float));
    res.len = a.len;
    for (int i = 0; i < a.len; i++) {
        res.data[i] = a.data[i] + b.data[i];
    }
    return res;
}

vector vec(int len, float* data) {
    vector res;
    res.data = malloc(len * sizeof(float));
    res.len = len;
    for (int i = 0; i < len; i++) {
        res.data[i] = data[i];
    }
    return res;
}

int main() {
    vector a = vec(2, (float[]){1, 2});
    vector b = vec(2, (float[]){3, 4});
    vector c = add(a, b);
    int ret = (int) c.data[0];
    free(a.data);
    free(b.data);
    free(c.data);
    return ret;
}
which gcc can no longer inline, producing a pile of function calls instead:
main:
push r13
mov edi, 2
push r12
push rbp
push rbx
sub rsp, 24
mov rax, QWORD PTR .LC0[rip]
mov rsi, rsp
mov QWORD PTR [rsp], rax
call vec
lea rsi, [rsp+8]
mov edi, 2
mov r12, rax
mov rax, QWORD PTR .LC1[rip]
mov rbx, rdx
mov QWORD PTR [rsp+8], rax
call vec
mov rsi, rbx
mov rdi, r12
mov rcx, rdx
mov rdx, rax
mov rbp, rax
call add
mov rdi, r12
cvttss2si r13d, DWORD PTR [rax]
mov rbx, rax
call free
mov rdi, rbp
call free
mov rdi, rbx
call free
add rsp, 24
pop rbx
mov eax, r13d
pop rbp
pop r12
pop r13
ret
I have spent the remaining pre-project time fussing about with what it's like to program C in this style, by writing a toy physics engine. This is not yet science, but it has given me a sense of which features will need to be implemented.
The big annoyance so far is the lack of an ergonomic growable array. Runtime checks and logging in debug mode are absolutely critical. With the debug ifdefs turned on, the allocator currently looks less like a float** and more like
typedef struct {
    float *start;
    float *current;
    float *end;
    char do_logging;
} allocator_storage;

typedef allocator_storage * allocator;
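Against that struct, alloc picks up its checks and logging. This is a sketch of the shape only; the exact check and log policy here is my guess at what "runtime checks and logging" means:

#include <assert.h>
#include <stdio.h>

float *alloc(allocator s, int len) {
    s->current -= len;
    assert(s->current >= s->start && "bump allocator exhausted");
    if (s->do_logging)
        fprintf(stderr, "alloc %d floats, %ld floats left\n",
                len, (long)(s->current - s->start));
    return s->current;
}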
The target experiments will run a workload such as the toy physics simulator in three parallel implementations: the branchless allocation approach described here, RAII-style mallocing and freeing, and hardcoding vector lengths so everything can live on the stack (sketched below).
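For the third variant, a sketch of what "hardcode the length and move everything to the stack" means (N = 3 is an arbitrary illustrative size, and this vec3 is mine, not from the simulator):

#define N 3
typedef struct { float data[N]; } vec3;  // fixed size, passed by value

vec3 add_vv(vec3 a, vec3 b) {
    vec3 r;
    for (int i = 0; i < N; i++)
        r.data[i] = a.data[i] + b.data[i];
    return r;  // no allocator, no free: everything lives in registers or on the stack
}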
I compiled the all-bump simulator to WebAssembly, which lets me embed it in this proposal; that has to be worth something.