James Routley

A while back I was messing around with eBPF and I couldn’t find any good materials on using BPF without libbpf/LLVM/GCC (i.e. rawdogging bytecode), so here’s the resource I wished I had.

I’ll be using Linux version 6.18 in this post; eBPF moves pretty fast and the verifier gets smarter with every release, so the information in this post might be out of date when you read this.

In conjunction with this post, I would recommend reading the documentation (especially to supplement parts that might be confusing):

BPF ISA reference
BPF syscall reference
BTF reference
ebpf.io docs (for maps, program types, helpers/KFuncs)

eBPF Hello World

Linux exposes eBPF functionality to user space through the bpf syscall, whose functionality is split into several subcommands.

The basic flow of eBPF is to create an array of eBPF bytecode instructions, pass it to the BPF_PROG_LOAD subcommand along with a program type (e.g. socket filter), and then attach it to the corresponding resource (like a socket or a kprobe).

Zig

const std = @import("std");

const linux = std.os.linux;
const posix = std.posix;
const BPF = linux.BPF;
const AF = linux.AF;
const SOCK = linux.SOCK;
const SOL = linux.SOL;
const SO = linux.SO;

/// Loads a two-instruction socket filter, attaches it to one end of a Unix
/// datagram socket pair, then sends "Hello" through the pair and logs what
/// arrives on the filtered end.
pub fn main() !void {
    // The entire program: r0 = 4; exit.
    const insns = [_]BPF.Insn{
        .mov(.r0, 4),
        .exit(),
    };

    // Zero-filled rather than `undefined`, so sliceTo() below always finds a
    // terminator even if the kernel never writes to the buffer.
    var verifier_log = [_]u8{0} ** 0x10000;
    var log = BPF.Log{ .buf = &verifier_log, .level = 2 };
    // The license string is the subject of footnote ¹ in the accompanying text.
    const progfd = BPF.prog_load(.socket_filter, &insns, &log, "GPL v2", 0, 0) catch |err| {
        // A rejected program is exactly when the verifier log is needed.
        std.debug.print("BPF Verifier output:\n{s}", .{std.mem.sliceTo(&verifier_log, 0)});
        return err;
    };
    defer posix.close(progfd);

    std.debug.print("BPF Verifier output:\n{s}", .{std.mem.sliceTo(&verifier_log, 0)});

    var socks: [2]linux.fd_t = undefined;
    switch (posix.errno(linux.socketpair(AF.UNIX, SOCK.DGRAM, 0, &socks))) {
        .SUCCESS => {},
        else => |e| return posix.unexpectedErrno(e),
    }
    defer {
        posix.close(socks[0]);
        posix.close(socks[1]);
    }

    // Attach the filter to the receiving end; the option value is the program fd.
    const attach_rc = linux.setsockopt(
        socks[0],
        SOL.SOCKET,
        SO.ATTACH_BPF,
        std.mem.asBytes(&progfd),
        @sizeOf(@TypeOf(progfd)),
    );
    switch (posix.errno(attach_rc)) {
        .SUCCESS => {},
        else => |e| return posix.unexpectedErrno(e),
    }

    const input = "Hello";
    _ = try posix.write(socks[1], input);

    var buf: [input.len]u8 = undefined;
    const n_read = try posix.read(socks[0], &buf);

    std.log.info("Sent '{s}', received '{s}'", .{ input, buf[0..n_read] });
}

BPF Verifier output:
func#0 @0
Live regs before insn:
      0: .......... (b7) r0 = 4
      1: 0......... (95) exit
0: R1=ctx() R10=fp0
0: (b7) r0 = 4                        ; R0=4
1: (95) exit
processed 2 insns (limit 1000000) max_states_per_insn 0 total_states 0 peak_states 0 mark_read 0
info: Sent 'Hello', received 'Hell'

Note the received message: a socket filter’s return value is the number of bytes of the packet to keep, so returning 4 truncates “Hello” to “Hell”. A socket filter is the OG type of eBPF program (and the namesake for BPF), but in addition to high performance packet filtering, eBPF can be used for syscall tracing, perf counter monitoring, security policy enforcement, and everything in between (BPF_PROG_TYPE_XDP programs can even be offloaded to run directly on the NIC, bypassing the kernel entirely²).

Now is a good time to mention that there are many restrictions on how unprivileged users (those without CAP_SYS_ADMIN or CAP_BPF) can interact with eBPF—as there should be, because a verified eBPF program runs with more or less the same privileges as a kernel module. The restrictions that I know of (not exhaustive) are as follows:

Programs are limited to 4096 instructions
There are 79 types of eBPF programs³, but regular users are only allowed to use socket filters and cgroup socket buffers
- Even for programs with CAP_BPF, many program types are gated behind CAP_NET_ADMIN and/or CAP_PERFMON
No subprograms (i.e. static functions)
No back edges (and consequently no loops)
No KFuncs
Most eBPF helpers are off-limits
Sanitation
- Arithmetic with constant values in registers is changed to use those constants directly (ALU sanitation)⁴
- Dead code is turned into ja -1 to force either an infinite loop or exit instead of potentially executing beyond the bounds of the program

Clearly a lot of the more interesting stuff is beyond the reach of unprivileged users, so I will henceforth be assuming that our process has CAP_BPF, CAP_NET_ADMIN and CAP_PERFMON (although not root).

Passing data between kernel and user space

eBPF maps are data structures used for sharing data between eBPF programs and user space (or other eBPF programs). Contrary to the name, there are more types of eBPF maps than just hashmaps, and eBPF maps are also used for other purposes like storing string constants.

To demonstrate usage of eBPF maps, let’s write a simple eBPF program that drops a packet if it contains the substring “foobar”.

The strncmp helper function looks like it could be useful, but it’s a little unclear what the eBPF equivalent of a char * is. Let’s check the source:

/*
 * Prototype record for the strncmp helper: names the implementing function,
 * its return kind, and the kind of value expected in each argument.
 */
static const struct bpf_func_proto bpf_strncmp_proto = {
	.func       = bpf_strncmp,
	.gpl_only   = false,
	.ret_type   = RET_INTEGER,
	/* arg 1: read-only memory holding the bytes to compare */
	.arg1_type  = ARG_PTR_TO_MEM | MEM_RDONLY,
	/* arg 2: constant size of the arg 1 buffer */
	.arg2_type  = ARG_CONST_SIZE,
	/*
	 * arg 3: checked by check_reg_const_str, which requires the register to
	 * point at a NUL-terminated string stored in a read-only eBPF map.
	 */
	.arg3_type  = ARG_PTR_TO_CONST_STR,
};

Arguments 1 and 2 are self-explanatory, but what about argument 3? Further inspection reveals that ARG_PTR_TO_CONST_STR is checked by the function check_reg_const_str, which ensures that a register points to an element (a null-terminated string) of a read-only eBPF map.

Here’s how we’ll create such a map:

Zig

/// Creates a one-entry array map whose single value is `str` including its
/// NUL terminator, marks it read-only for eBPF programs, and freezes it so
/// user space cannot modify it either.
///
/// Returns the map fd; the caller owns it. On any failure after creation the
/// fd is closed before the error is returned.
fn bpf_string_map(str: [:0]const u8) !posix.fd_t {
    // The kernel copies exactly `value_size` bytes from the value pointer, so
    // the slot is sized to the string plus its terminator. A fixed 8-byte slot
    // would over-read short strings and cut the NUL off longer ones.
    const value: []const u8 = str.ptr[0 .. str.len + 1];

    var attr = BPF.Attr{
        .map_create = std.mem.zeroes(BPF.MapCreateAttr),
    };

    attr.map_create.map_type = @intFromEnum(BPF.MapType.array);
    attr.map_create.key_size = @sizeOf(i32);
    attr.map_create.value_size = @intCast(value.len);
    attr.map_create.max_entries = 1;
    // make this map read-only from the ebpf program
    attr.map_create.map_flags = BPF.BPF_F_RDONLY_PROG;

    const rc = linux.bpf(.map_create, &attr, @sizeOf(BPF.MapCreateAttr));
    const fd: posix.fd_t = switch (posix.errno(rc)) {
        .SUCCESS => @intCast(rc),
        .INVAL => return error.MapTypeOrAttrInvalid,
        .NOMEM => return error.SystemResources,
        .PERM => return error.AccessDenied,
        else => |err| return posix.unexpectedErrno(err),
    };
    // Do not leak the map if populating or freezing it fails.
    errdefer posix.close(fd);

    // Store the string at index 0.
    try BPF.map_update_elem(fd, &std.mem.toBytes(@as(i32, 0)), value, BPF.ANY);

    attr = BPF.Attr{
        .map_elem = std.mem.zeroes(BPF.MapElemAttr),
    };
    attr.map_elem.map_fd = fd;
    // make this map read-only from userspace
    try switch (posix.errno(linux.bpf(.map_freeze, &attr, @sizeOf(BPF.MapElemAttr)))) {
        .SUCCESS => {},
        else => |err| posix.unexpectedErrno(err),
    };

    return fd;
}

Now we can iterate over the packet data one byte at a time, returning DROP if strncmp returns 0.

Zig

/// Verdicts the sk_skb programs in this post place in r0 before exiting.
const SK = enum(i32) {
    /// Discard the packet.
    DROP = 0,
    /// Let the packet through.
    PASS = 1,
};

// Read-only, frozen map whose only element is the needle string.
const mapfd = try bpf_string_map("foobar");

// sk_skb verdict program: slide a 6-byte window over the packet and DROP as
// soon as the window equals "foobar"; PASS otherwise.
//
// Register use: r6 = ctx, r7 = packet length, r8 = window offset,
// r9 = pointer to the map value ("foobar").
// Stack use: fp-0x08 = saved ctx, fp-0x10 = map key (0), fp-0x18 = window.
const insns = [_]BPF.Insn{
    .mov(.r6, .r1),
    .stx(.double_word, .r10, -0x8, .r1),
    .st(.double_word, .r10, -0x10, 0),

    // return if packet_len < 6 bytes
    .ldx(.double_word, .r1, .r10, -0x8),
    .ldx(.word, .r7, .r1, 0),
    .jmp(.jge, .r7, 6, 2),
    .mov(.r0, @intFromEnum(SK.PASS)),
    .exit(),

    // load "foobar" into r9
    .ld_map_fd1(.r1, mapfd),
    .ld_map_fd2(mapfd),
    .mov(.r2, .r10),
    .add(.r2, -0x10),
    .call(.map_lookup_elem),
    // A NULL lookup result drops the packet.
    .jmp(.jne, .r0, 0, 2),
    .mov(.r0, @intFromEnum(SK.DROP)),
    .exit(),
    .mov(.r9, .r0),

    // begin checking for "foobar"
    .mov(.r8, 0),
    // loop head: the backward jump at the bottom lands here
    .mov(.r1, .r6),
    .mov(.r2, .r8),
    .mov(.r3, .r10),
    .add(.r3, -0x18),
    .mov(.r4, 6),
    .call(.skb_load_bytes),
    // skb_load_bytes reports failure with a negative value, so the test must
    // be signed: an unsigned "less than 0" is never true and would leave the
    // loop running all 0x200 iterations past the end of the packet.
    .jmp(.jslt, .r0, 0, 10),

    // drop packet if it contains "foobar"
    .mov(.r1, .r10),
    .add(.r1, -0x18),
    .mov(.r2, 6),
    .mov(.r3, .r9),
    // bug in Zig std, call helper manually
    // .call(.strncmp),
    .{
        .code = BPF.CALL | BPF.JMP,
        .dst = 0,
        .src = 0,
        .off = 0,
        .imm = 182,
    },
    .jmp(.jne, .r0, 0, 2),
    .mov(.r0, @intFromEnum(SK.DROP)),
    .exit(),
    // advance the window by one byte; the offset is capped at 0x200
    .add(.r8, 1),
    .jmp(.jlt, .r8, 0x200, -17),

    .mov(.r0, @intFromEnum(SK.PASS)),
    .exit(),
};

// Zero-filled rather than `undefined`, so the errdefer below always prints a
// properly terminated string even if the kernel never wrote to the buffer.
var verifier_log = [_]u8{0} ** 0x10000;
var log = BPF.Log{ .buf = &verifier_log, .level = 4 };
errdefer std.debug.print("BPF Verifier output:\n{s}", .{std.mem.sliceTo(&verifier_log, 0)});

const progfd = try BPF.prog_load(.sk_skb, &insns, &log, "GPL v2", 0, 0);

// One-slot sockmap; the verdict program is attached to the map, and sockets
// are placed under it by inserting them into the map.
const sockmapfd = try BPF.map_create(.sockmap, @sizeOf(i32), @sizeOf(i32), 1);

{
    var attr = BPF.Attr{
        .prog_attach = .{
            .target_fd = sockmapfd,
            .attach_bpf_fd = progfd,
            .attach_type = @intFromEnum(BPF.AttachType.sk_skb_stream_verdict),
            .attach_flags = 0,
            .replace_bpf_fd = 0,
        },
    };

    try switch (posix.errno(linux.bpf(.prog_attach, &attr, @sizeOf(BPF.ProgAttachAttr)))) {
        .SUCCESS => {},
        .ACCES => error.UnsafeProgram,
        .FAULT => unreachable,
        .INVAL => error.InvalidProgram,
        .PERM => error.PermissionDenied,
        else => |err| posix.unexpectedErrno(err),
    };
}

// Non-blocking, so reading after a dropped packet returns immediately.
var socks: [2]linux.fd_t = undefined;
switch (posix.errno(linux.socketpair(AF.UNIX, SOCK.DGRAM | SOCK.NONBLOCK, 0, &socks))) {
    .SUCCESS => {},
    else => |e| return posix.unexpectedErrno(e),
}
// Put the receiving socket into slot 0 of the sockmap.
try BPF.map_update_elem(sockmapfd, &std.mem.toBytes(@as(i32, 0)), std.mem.asBytes(&socks[0]), BPF.ANY);

const packets = [_][]const u8{
    "foo",
    "bar",
    "foobar",
    "fooba",
    "snafu",
    "ffoobarbaz",
    "bazbarfoo",
};

var buf: [0x10]u8 = undefined;
for (packets) |p| {
    _ = try posix.write(socks[1], p);
    const n_read = posix.read(socks[0], &buf) catch |err| switch (err) {
        // Nothing queued on the non-blocking socket: report an empty message.
        error.WouldBlock => 0,
        // Anything else is a real failure and must not be reported as a drop.
        else => return err,
    };
    std.log.info("Sent '{s}', received '{s}'", .{ p, buf[0..n_read] });
}

info: Sent 'foo', received 'foo'
info: Sent 'bar', received 'bar'
info: Sent 'foobar', received ''
info: Sent 'fooba', received 'fooba'
info: Sent 'snafu', received 'snafu'
info: Sent 'ffoobarbaz', received ''
info: Sent 'bazbarfoo', received 'bazbarfoo'

Type information (BTF)

Even though the eBPF verifier uses type information extensively to prove correctness of a program, almost all eBPF instructions don’t encode type information (besides integer width), so how do we represent types in eBPF?

That’s where BTF (BPF type format) comes in. It’s described as a format that “encodes the debug info related to BPF program/map,” but in my opinion this is misleading, as it encodes both information useful for debugging and information that the eBPF verifier uses to check that your program is valid.

That is to say, eBPF programs that use certain features require BTF information in order to pass verification.⁵

To see this in action, let’s rewrite the previous program using the bpf_loop helper and the bpf_strcmp KFunc.

According to the docs, the second argument of bpf_loop, void *callback_fn, is a pointer to a static function. But wait, how do you define a (static) function within an eBPF program? The preferred verbiage for functions within a program seems to be “subprograms”, which are subsequences of instructions within the list of instructions that you pass to the bpf syscall.

To let the verifier know that your subprogram exists, you need to use a special variant of the CALL or LD opcodes, which jump to a program-local function or load a program-local function pointer into a register, respectively.

Zig

// Entry subprogram: runs helper_insns once per candidate window offset via the
// loop helper (id 181) and turns the flag the callback leaves behind into a
// verdict.
//
// Stack layout: fp-0x08 = saved ctx, fp-0x10 = "found" flag (starts at 0),
// fp-0x18 = pointer to fp-0x08; the address of fp-0x18 is what the callback
// receives as its second argument.
const main_insns = [_]BPF.Insn{
    .stx(.double_word, .r10, -0x8, .r1),
    .st(.double_word, .r10, -0x10, 0),
    .mov(.r1, .r10),
    .add(.r1, -0x8),
    .stx(.double_word, .r10, -0x18, .r1),

    // return if packet_len < 6 bytes
    .ldx(.double_word, .r1, .r10, -0x8),
    .ldx(.word, .r7, .r1, 0),
    .jmp(.jge, .r7, 6, 2),
    .mov(.r0, @intFromEnum(SK.PASS)),
    .exit(),

    // r1 = packet_len - 5, i.e. the number of 6-byte windows in the packet
    .mov(.r1, .r7),
    .add(.r1, -5),
    // r2 = pointer to the callback subprogram. These two slots sit at
    // instruction indices 12 and 13 and this array has 24 instructions, so
    // 12 + 1 + 11 = 24. NOTE(review): presumably 11 is the offset from the
    // next instruction to the start of helper_insns, which is loaded right
    // after this array; confirm against the _ld_funcall helpers.
    _ld_funcall1(.r2, 11),
    _ld_funcall2(11),
    // r3 = callback context (address of fp-0x18), r4 = 0
    .mov(.r3, .r10),
    .add(.r3, -0x18),
    .mov(.r4, 0),
    // .call(.loop),
    .{
        .code = BPF.CALL | BPF.JMP,
        .dst = 0,
        .src = 0,
        .off = 0,
        .imm = 181,
    },
    // the callback writes 1 to fp-0x10 when it finds a match
    .ldx(.double_word, .r1, .r10, -0x10),
    .jmp(.jeq, .r1, 1, 2),
    .mov(.r0, @intFromEnum(SK.PASS)),
    .exit(),

    .mov(.r0, @intFromEnum(SK.DROP)),
    .exit(),
};

// Loop callback: compares the 6-byte window at offset `index` with the map
// string and, on a match, sets the caller's "found" flag.
// Returns 1 on a match or on any failure, 0 otherwise.
//
// Stack layout: fp-0x08 = saved callback ctx, fp-0x10 = map key (0),
// fp-0x18 = window buffer.
const helper_insns = [_]BPF.Insn{
    // r1 = loop index, r2 = ctx
    .stx(.double_word, .r10, -0x8, .r2),
    .st(.double_word, .r10, -0x10, 0),
    // Zero the whole 8-byte window slot first. Only 6 bytes are loaded into
    // it below, and the string comparison keeps reading until a NUL, so the
    // byte after the window must be a terminator rather than stale stack.
    .st(.double_word, .r10, -0x18, 0),
    .mov(.r3, .r1),

    // load skb->data[index..][0..6] onto the stack
    // (ctx -> pointer to the caller's fp-0x08 -> saved sk_buff ctx)
    .ldx(.double_word, .r1, .r2, 0),
    .ldx(.double_word, .r1, .r1, 0),
    .mov(.r2, .r3),
    .mov(.r3, .r10),
    .add(.r3, -0x18),
    .mov(.r4, 6),
    .call(.skb_load_bytes),
    .jmp(.jeq, .r0, 0, 2),
    .mov(.r0, 1),
    .exit(),

    // load "foobar" into r9
    .ld_map_fd1(.r1, mapfd),
    .ld_map_fd2(mapfd),
    .mov(.r2, .r10),
    .add(.r2, -0x10),
    .call(.map_lookup_elem),
    .jmp(.jne, .r0, 0, 2),
    .mov(.r0, 1),
    .exit(),
    .mov(.r9, .r0),

    // call the strcmp KFunc by BTF id: r1 = window buffer, r2 = map string
    .mov(.r1, .r10),
    .add(.r1, -0x18),
    .mov(.r2, .r9),
    .{
        .code = BPF.CALL | BPF.JMP,
        .dst = 0,
        .src = 2,
        .off = 0,
        .imm = strcmp_btf_id,
    },
    .jmp(.jne, .r0, 0, 5),
    .ldx(.double_word, .r1, .r10, -0x8),
    .ldx(.double_word, .r1, .r1, 0),
    // r1 now points at the caller's fp-0x08 (the saved sk_buff ctx pointer),
    // so this stores "1" in the caller's stack slot just below it (fp-0x10),
    // which is the flag the entry subprogram reads after the loop.
    .st(.double_word, .r1, -0x8, 1),
    .mov(.r0, 1),
    .exit(),

    .mov(.r0, 0),
    .exit(),
};

BPF Verifier output:
missing btf func_info
verification time 13905 usec
stack depth 24+0
processed 13 insns (limit 1000000) max_states_per_insn 0 total_states 1 peak_states 1 mark_read 0
error: InvalidProgram
/nix/store/pp3rdgdy6pnji9zm91qqcd6c86wljw58-zig-0.15.2/lib/zig/std/os/linux/bpf.zig:1710:5: 0x10c0602 in prog_load (tmp.vvZfe6NDKR)
/tmp/nix-shell.6Qivnq/tmp.vvZfe6NDKR.zig:167:20: 0x10c15cf in main (tmp.vvZfe6NDKR)

Ok, now the verifier is upset because we’re missing btf func_info. Basically, if we want to use callbacks, we need to tell the verifier the type signatures of all of our subprograms.

When loading our program, one of the attributes we can pass in is func_info, which is a pointer to an array of bpf_func_info structs, each of which contains an offset of a subprogram and a “type id” for a BTF_KIND_FUNC_PROTO representing that subprogram’s type information. A type id is basically an index into the list of types contained within a BTF object; because a BPF map or program can only reference types within a single BTF object (not counting btf_vmlinux), for all intents and purposes you should shove any type information your program will need into one BTF object.

Fortunately, constructing a BTF object is fairly straightforward (if a little tedious), so I will let the code do the talking:

Zig

/// Linkage values written into a BTF FUNC record by type_to_btf_context.
const BTF_FUNC = enum(u32) {
    STATIC = 0,
    GLOBAL = 1,
    EXTERN = 2,
};

/// One func_info record passed to the kernel when loading a program; an array
/// of these is handed over via the prog_load attributes.
const bpf_func_info = extern struct {
    /// Instruction offset at which the subprogram starts.
    insn_off: u32,
    /// BTF type id describing that subprogram.
    type_id: u32,
};

/// Marker type: a `*BPFContext` parameter is encoded by type_to_btf_context
/// as a pointer to void accompanied by an "arg:ctx" DECL_TAG.
const BPFContext = opaque {};

/// Accumulator for a BTF blob that is being assembled at comptime.
const BTFContext = struct {
    /// (type name, BTF type id) pairs registered so far; "void" is id 0.
    type_map: []const struct { []const u8, u32 } = &.{.{ "void", 0 }},
    /// String section; begins with a single NUL byte.
    strings: []const u8 = "\x00",
    /// Encoded type section.
    btf_bytes: []const u8 = &.{},
    /// Id assigned to the next type that gets registered.
    type_id: u32 = 1,

    /// Returns the id registered for `T` (matched by `@typeName`), or null if
    /// `T` has not been registered.
    pub fn indexOf(self: *const @This(), T: type) ?usize {
        const wanted = @typeName(T);
        inline for (self.type_map) |entry| {
            if (std.mem.eql(u8, wanted, entry[0])) return entry[1];
        }
        return null;
    }
};

/// Comptime helper: registers `T` in `ctx`, appends its BTF encoding (and the
/// encoding of any types it depends on) and returns the extended context.
/// If `T` is already registered, `ctx` is returned unchanged.
///
/// Supported: integers, bool, floats, enums, pointers and function types.
/// Arrays, structs and unions are rejected with a compile error.
///
/// NOTE(review): the id bookkeeping in the function branch is tuned to the
/// cases used in this file (a `*BPFContext` parameter, or parameter types that
/// are already registered). Other new parameter types look like they would be
/// registered under an id one past their position in the blob; confirm before
/// relying on them.
fn type_to_btf_context(T: type, comptime ctx: BTFContext) BTFContext {
    comptime var ret = ctx;

    @setEvalBranchQuota(2000000);

    if (BTFContext.indexOf(&ctx, T)) |_| {
        return ctx;
    } else {
        // Register T under the next id before emitting it.
        ret.type_map = ret.type_map ++ .{.{ @typeName(T), ret.type_id }};
        ret.type_id += 1;

        switch (@typeInfo(T)) {
            .int => |info| {
                // 1 = signed, 2 = char (used for u8), 0 = none.
                const encoding: u32 = if (info.signedness == .signed) 1 else if (T == u8) 2 else 0;
                ret.btf_bytes = ret.btf_bytes ++ std.mem.asBytes(&BPF.btf.Type{
                    .name_off = 0,
                    .info = .{
                        .vlen = 0,

                        .unused_1 = 0,
                        .kind = .int,
                        .unused_2 = 0,

                        .kind_flag = false,
                    },
                    // BTF sizes are in bytes: round the bit width up to whole bytes.
                    .size_type = .{ .size = (info.bits + 7) / 8 },
                }) ++ std.mem.toBytes(@as(u32, (encoding << 24) + @as(u32, 0 << 16) + info.bits));
            },
            .bool => {
                const encoding: u32 = 4;
                ret.btf_bytes = ret.btf_bytes ++ std.mem.asBytes(&BPF.btf.Type{
                    .name_off = 0,
                    .info = .{
                        .vlen = 0,

                        .unused_1 = 0,
                        .kind = .int,
                        .unused_2 = 0,

                        .kind_flag = false,
                    },
                    .size_type = .{ .size = @sizeOf(bool) },
                }) ++ std.mem.toBytes(@as(u32, (encoding << 24) + @as(u32, 0 << 16) + @bitSizeOf(bool)));
            },
            .float => |info| {
                ret.btf_bytes = ret.btf_bytes ++ std.mem.asBytes(&BPF.btf.Type{
                    .name_off = 0,
                    .info = .{
                        .vlen = 0,

                        .unused_1 = 0,
                        .kind = .float,
                        .unused_2 = 0,

                        .kind_flag = false,
                    },
                    // Size in bytes, as for integers.
                    .size_type = .{ .size = (info.bits + 7) / 8 },
                });
            },
            .@"enum" => |info| {
                const enum_size = switch (@sizeOf(info.tag_type)) {
                    1, 2, 4, 8 => |s| s,
                    else => @compileError("enum size must be 1/2/4/8 bytes"),
                };
                // The enum type info has no signedness of its own; it comes
                // from the integer tag type.
                const is_signed = @typeInfo(info.tag_type).int.signedness == .signed;

                ret.btf_bytes = ret.btf_bytes ++ std.mem.asBytes(&BPF.btf.Type{
                    .name_off = 0,
                    .info = .{
                        .vlen = info.fields.len,

                        .unused_1 = 0,
                        .kind = if (enum_size == 8) .enum64 else .@"enum",
                        .unused_2 = 0,

                        .kind_flag = is_signed,
                    },
                    .size_type = .{ .size = enum_size },
                });

                inline for (info.fields) |f| {
                    if (enum_size == 8) {
                        // f.value is a comptime_int: give it a concrete width
                        // before splitting it into two 32-bit halves.
                        const raw: u64 = @truncate(@as(u128, @bitCast(@as(i128, f.value))));
                        ret.btf_bytes = ret.btf_bytes ++ std.mem.asBytes(&BPF.btf.Enum64{
                            .name_off = ret.strings.len,
                            .val_lo32 = @as(i32, @bitCast(@as(u32, @truncate(raw)))),
                            .val_hi32 = @as(i32, @bitCast(@as(u32, @truncate(raw >> 32)))),
                        });
                    } else {
                        ret.btf_bytes = ret.btf_bytes ++ std.mem.asBytes(&BPF.btf.Enum{
                            .name_off = ret.strings.len,
                            // Keep the low 32 bits, reinterpreted as signed.
                            .val = @as(i32, @truncate(@as(i64, f.value))),
                        });
                    }
                    // Every entry in the string section must be NUL-terminated.
                    ret.strings = ret.strings ++ f.name ++ "\x00";
                }
            },
            .pointer => |info| {
                if (info.child == BPFContext and BTFContext.indexOf(&ret, BPFContext) == null) {
                    // Context pointer: an "arg:ctx" DECL_TAG followed by a
                    // pointer to void. The tag refers to type id 1 with
                    // component index 0.
                    ret.btf_bytes = ret.btf_bytes ++
                        std.mem.asBytes(&BPF.btf.Type{
                            .name_off = ret.strings.len,
                            .info = .{
                                .vlen = 0,

                                .unused_1 = 0,
                                .kind = .decl_tag,
                                .unused_2 = 0,

                                .kind_flag = false,
                            },
                            .size_type = .{ .typ = 1 },
                        }) ++ std.mem.asBytes(&BPF.btf.DeclTag{
                        .component_idx = 0,
                    }) ++ std.mem.asBytes(&BPF.btf.Type{
                        .name_off = 0,
                        .info = .{
                            .vlen = 0,

                            .unused_1 = 0,
                            .kind = .ptr,
                            .unused_2 = 0,

                            .kind_flag = false,
                        },
                        .size_type = .{ .typ = 0 },
                    });
                    ret.strings = ret.strings ++ "arg:ctx\x00";
                    // Later `*void` parameters reuse this pointer-to-void record.
                    ret.type_map = ret.type_map ++ .{.{ @typeName(*void), ret.type_id - 1 }};
                } else {
                    // Generic pointer: a PTR record followed by whatever the
                    // pointee needs. The pointee is encoded into a scratch
                    // context so its bytes can be placed after the PTR.
                    comptime var _ctx = ret;
                    _ctx.btf_bytes = &.{};
                    const new = type_to_btf_context(info.child, _ctx);
                    // A PTR record has no trailing payload. Bytes accumulated
                    // before this call are kept, and the pointee's
                    // registrations are carried over.
                    ret.btf_bytes = ret.btf_bytes ++
                        std.mem.asBytes(&BPF.btf.Type{
                            .name_off = 0,
                            .info = .{
                                .vlen = 0,

                                .unused_1 = 0,
                                .kind = .ptr,
                                .unused_2 = 0,

                                .kind_flag = false,
                            },
                            .size_type = .{ .typ = BTFContext.indexOf(&new, info.child).? },
                        }) ++ new.btf_bytes;
                    ret.type_id = new.type_id;
                    ret.strings = new.strings;
                    ret.type_map = new.type_map;
                }
            },
            .@"fn" => |info| {
                // The calling convention selects the BTF linkage, which is
                // stored in the FUNC record's vlen field.
                const cc: u16 = switch (info.calling_convention) {
                    .auto => @intFromEnum(BTF_FUNC.STATIC),
                    .naked => @intFromEnum(BTF_FUNC.GLOBAL),
                    else => @compileError("only auto and naked calling conventions are allowed"),
                };

                // Dependent types are collected in a scratch context and
                // appended after the FUNC / FUNC_PROTO pair. The extra id
                // reserved here is the FUNC_PROTO's.
                comptime var _ctx = ret;
                _ctx.btf_bytes = &.{};
                _ctx.type_id += 1;
                _ctx = type_to_btf_context(info.return_type.?, _ctx);

                ret.btf_bytes = ret.btf_bytes ++ std.mem.asBytes(&BPF.btf.Type{
                    .name_off = _ctx.strings.len,
                    .info = .{
                        .vlen = cc,

                        .unused_1 = 0,
                        .kind = .func,
                        .unused_2 = 0,

                        .kind_flag = false,
                    },
                    // The FUNC_PROTO directly follows the FUNC.
                    .size_type = .{ .typ = ret.type_id },
                }) ++ std.mem.asBytes(&BPF.btf.Type{
                    .name_off = 0,
                    .info = .{
                        .vlen = info.params.len,

                        .unused_1 = 0,
                        .kind = .func_proto,
                        .unused_2 = 0,

                        .kind_flag = false,
                    },
                    .size_type = .{ .typ = BTFContext.indexOf(&_ctx, info.return_type.?).? },
                });
                // Every function is named "foo".
                _ctx.strings = _ctx.strings ++ "foo\x00";

                inline for (info.params) |p| {
                    if (p.type) |t| {
                        // NOTE(review): the counter is bumped around the call
                        // so that a `*BPFContext` parameter is registered
                        // under the id of its PTR record, which comes after
                        // the DECL_TAG. See the note on this function about
                        // other new parameter types.
                        _ctx.type_id += 1;
                        _ctx = type_to_btf_context(t, _ctx);
                        _ctx.type_id -= 1;
                        ret.btf_bytes = ret.btf_bytes ++ std.mem.asBytes(&BPF.btf.Param{
                            .name_off = _ctx.strings.len,
                            .typ = BTFContext.indexOf(&_ctx, t).?,
                        });
                        // Every parameter is named "foo" as well.
                        _ctx.strings = _ctx.strings ++ "foo\x00";
                    } else @compileError("param type cannot be null");
                }
                ret.btf_bytes = ret.btf_bytes ++ _ctx.btf_bytes;
                ret.type_id = _ctx.type_id;
                ret.strings = _ctx.strings;
                ret.type_map = _ctx.type_map;
            },
            .array, .@"struct", .@"union" => @compileError("unimplemented"),
            else => @compileError(@typeName(T) ++ " is unimplemented or unsupported"),
        }
        return ret;
    }
}

/// Builds a BTF blob describing `funcs` at comptime, loads it, concatenates
/// `progs` into a single instruction stream and loads that as a program of
/// type `prog_type`, with one func_info record per subprogram.
///
/// `progs[i]` is described by `funcs[i]`; both must have the same length.
/// Returns the program fd. The BTF fd is closed before returning. Both kernel
/// logs are printed to stderr: the BTF log always, the program log on failure.
fn prog_load_with_btf(prog_type: BPF.ProgType, progs: []const []const BPF.Insn, comptime funcs: []const type) !posix.fd_t {
    if (funcs.len > 32) {
        @compileError("32 arguments max are supported");
    }
    // Each subprogram needs exactly one function type.
    if (progs.len != funcs.len) return error.InvalidProgram;

    comptime var ctx = BTFContext{};
    comptime var func_type_ids: [funcs.len]u32 = undefined;
    inline for (funcs, &func_type_ids) |f, *ft| {
        ctx = comptime type_to_btf_context(f, ctx);
        // NOTE(review): this appears to compensate for the id bookkeeping in
        // type_to_btf_context when a function takes a `*BPFContext`
        // parameter; confirm before adding functions of other shapes.
        ctx.type_id += 1;
        ft.* = comptime BTFContext.indexOf(&ctx, f).?;
    }

    // Header, then the type section, then the string section.
    const btf = std.mem.asBytes(&BPF.btf.Header{
        .magic = BPF.btf.magic,
        .version = 1,
        .flags = 0,
        .hdr_len = @sizeOf(BPF.btf.Header),
        .type_off = 0,
        .type_len = ctx.btf_bytes.len,
        .str_off = ctx.btf_bytes.len,
        .str_len = ctx.strings.len,
    }) ++ ctx.btf_bytes ++ ctx.strings;

    // Zero-filled so the deferred print always sees a terminated string.
    var btf_log = [_]u8{0} ** 0x10000;
    defer std.debug.print("BTF Verifier output:\n{s}", .{std.mem.sliceTo(&btf_log, 0)});

    var attr = BPF.Attr{
        .btf_load = std.mem.zeroes(BPF.BtfLoadAttr),
    };
    attr.btf_load.btf = @intFromPtr(btf);
    attr.btf_load.btf_size = btf.len;
    attr.btf_load.btf_log_buf = @intFromPtr(&btf_log);
    attr.btf_load.btf_log_size = btf_log.len;
    attr.btf_load.btf_log_level = 2;

    var rc = linux.bpf(.btf_load, &attr, @sizeOf(BPF.BtfLoadAttr));
    const btf_fd = try switch (posix.errno(rc)) {
        .SUCCESS => @as(posix.fd_t, @intCast(rc)),
        .ACCES => error.UnsafeProgram,
        .FAULT => unreachable,
        .INVAL => error.InvalidProgram,
        .PERM => error.AccessDenied,
        else => |err| posix.unexpectedErrno(err),
    };
    // The BTF fd is only needed for the prog_load call below.
    defer posix.close(btf_fd);

    // Lay the subprograms out back to back, recording where each one starts.
    var func_info: [funcs.len]bpf_func_info = undefined;
    var offset: u32 = 0;
    var insns: [4096]BPF.Insn = undefined;
    for (&func_info, func_type_ids, progs) |*fi, ft, p| {
        // Refuse input that does not fit the fixed buffer instead of writing
        // past its end.
        if (p.len > insns.len - offset) return error.InvalidProgram;
        fi.* = .{ .insn_off = offset, .type_id = ft };
        @memcpy(insns[offset..][0..p.len], p);
        offset += @intCast(p.len);
    }

    const license = "GPL v2";
    const kernel_version = 0;
    const flags = 0;

    // Zero-filled for the same reason as the BTF log.
    var bpf_log = [_]u8{0} ** 0x10000;
    errdefer std.debug.print("BPF Verifier output:\n{s}", .{std.mem.sliceTo(&bpf_log, 0)});

    attr = BPF.Attr{
        .prog_load = std.mem.zeroes(BPF.ProgLoadAttr),
    };

    attr.prog_load.prog_type = @intFromEnum(prog_type);
    attr.prog_load.insns = @intFromPtr(&insns);
    attr.prog_load.insn_cnt = offset;
    attr.prog_load.license = @intFromPtr(license.ptr);
    attr.prog_load.kern_version = kernel_version;
    attr.prog_load.prog_flags = flags;
    attr.prog_load.log_buf = @intFromPtr(&bpf_log);
    attr.prog_load.log_size = @intCast(bpf_log.len);
    attr.prog_load.log_level = 2;
    attr.prog_load.prog_btf_fd = btf_fd;
    attr.prog_load.func_info_rec_size = @sizeOf(bpf_func_info);
    attr.prog_load.func_info = @intFromPtr(&func_info);
    attr.prog_load.func_info_cnt = @intCast(func_info.len);

    rc = linux.bpf(.prog_load, &attr, @sizeOf(BPF.ProgLoadAttr));
    return switch (posix.errno(rc)) {
        .SUCCESS => @as(posix.fd_t, @intCast(rc)),
        .ACCES => error.UnsafeProgram,
        .FAULT => unreachable,
        .INVAL => error.InvalidProgram,
        .PERM => error.AccessDenied,
        else => |err| posix.unexpectedErrno(err),
    };
}

I make no claims that the code above is fundamentally correct (several BTF types are left unimplemented, not to mention I doubt it’s idiomatic Zig), but it gets the job done for our purposes.

Once we have our BTF, we load it using the BPF_BTF_LOAD subcommand, and we can then reference it by its fd when creating maps or programs.

Zig

// Subprograms and their function types are paired by position: main_insns
// takes the context pointer and is declared `callconv(.naked)`, which
// type_to_btf_context maps to GLOBAL linkage; helper_insns is the loop
// callback (index, context pointer) and gets STATIC linkage.
const progfd = try prog_load_with_btf(.sk_skb, &.{&main_insns, &helper_insns}, &.{fn (*BPFContext) callconv(.naked) u32, fn (u64, *void) u64});

The next thing to deal with is how to call a KFunc. There’s a variant of the CALL opcode that calls a function by BTF ID for exactly this purpose, so the next logical question is: how do we find the BTF ID for our KFunc?

The normal way to do this is to generate vmlinux.h with pahole/bpftool (bpftool btf dump file /sys/kernel/btf/vmlinux format c > vmlinux.h), but in the spirit of roughing it, we’ll forgo that in favor of parsing btf_vmlinux directly.

I say “parse”, but we actually only care about a specific BTF_KIND_FUNC with the name we’re searching for, so my implementation ignores everything besides that.

Zig

const builtin = @import("builtin");

/// Scans /sys/kernel/btf/vmlinux for a FUNC record called `name` and returns
/// its BTF type id.
///
/// Only the type section is walked. Every record other than FUNC is skipped
/// by reading past its kind-specific trailing data.
/// Returns error.InvalidBtf if the file does not start with the BTF magic in
/// native byte order, and error.NotFound if no FUNC has that name.
fn get_vmlinux_btf_id_from_name(name: []const u8) !u32 {
    const fd = try std.posix.open("/sys/kernel/btf/vmlinux", .{ .ACCMODE = .RDONLY }, 0o550);
    defer std.posix.close(fd);

    // Function names are read through this buffer, so it bounds the longest
    // name that can be compared.
    var buf: [0x200]u8 = undefined;
    var f_reader = std.fs.File.reader(.{ .handle = fd }, &buf);
    const reader = &f_reader.interface;
    const endian = builtin.cpu.arch.endian();

    const header = try reader.takeStruct(BPF.btf.Header, endian);
    if (header.magic != BPF.btf.magic) return error.InvalidBtf;

    // Section offsets in the header are relative to the end of the header.
    const types_start: u64 = @as(u64, header.hdr_len) + header.type_off;
    const types_end: u64 = types_start + header.type_len;
    const string_off: u64 = @as(u64, header.hdr_len) + header.str_off;

    try f_reader.seekTo(types_start);

    // Id 0 is void and is not stored, so the first record read is id 1.
    var btf_id: u32 = 0;
    while (f_reader.logicalPos() < types_end) {
        btf_id += 1;
        const btf_type = try reader.takeStruct(BPF.btf.Type, endian);
        switch (btf_type.info.kind) {
            .func => {
                // Jump to the name in the string section, compare, then
                // return to where the type walk left off.
                const pos = f_reader.logicalPos();
                try f_reader.seekTo(string_off + btf_type.name_off);
                if (std.mem.eql(u8, try reader.takeDelimiterExclusive('\x00'), name)) return btf_id;
                try f_reader.seekTo(pos);
            },
            // The remaining prongs only consume each kind's trailing data.
            .int => _ = try reader.takeStruct(extern struct { _: u32 }, endian),
            .array => _ = try reader.takeStruct(BPF.btf.Array, endian),
            .@"struct", .@"union" => for (0..btf_type.info.vlen) |_| {
                _ = try reader.takeStruct(BPF.btf.Member, endian);
            },
            .@"enum" => for (0..btf_type.info.vlen) |_| {
                _ = try reader.takeStruct(BPF.btf.Enum, endian);
            },
            .enum64 => for (0..btf_type.info.vlen) |_| {
                _ = try reader.takeStruct(BPF.btf.Enum64, endian);
            },
            .func_proto => for (0..btf_type.info.vlen) |_| {
                _ = try reader.takeStruct(BPF.btf.Param, endian);
            },
            .@"var" => _ = try reader.takeStruct(BPF.btf.Var, endian),
            .datasec => for (0..btf_type.info.vlen) |_| {
                _ = try reader.takeStruct(BPF.btf.VarSecInfo, endian);
            },
            .decl_tag => _ = try reader.takeStruct(BPF.btf.DeclTag, endian),
            // All other kinds consist of the common record only.
            else => {},
        }
    }
    return error.NotFound;
}

BTF Verifier output:
magic: 0xeb9f
version: 1
flags: 0x0
hdr_len: 24
type_off: 0
type_len: 132
str_off: 132
str_len: 29
btf_total_size: 185
[1] FUNC foo type_id=2
[2] FUNC_PROTO (anon) return=3 args=(5 foo)
[3] INT (anon) size=32 bits_offset=0 nr_bits=32 encoding=(none)
[4] DECL_TAG arg:ctx type=1 component_idx=0
[5] PTR (anon) type_id=0
[6] FUNC foo type_id=7
[7] FUNC_PROTO (anon) return=8 args=(8 foo, 5 foo)
[8] INT (anon) size=64 bits_offset=0 nr_bits=64 encoding=(none)
info: Sent 'foo', received 'foo'
info: Sent 'bar', received 'bar'
info: Sent 'foobar', received ''
info: Sent 'fooba', received 'fooba'
info: Sent 'snafu', received 'snafu'
info: Sent 'ffoobarbaz', received ''
info: Sent 'bazbarfoo', received 'bazbarfoo'

Hopefully this has been educational and helpful if you have the ~~misfortune~~ pleasure of working with eBPF at the bytecode level, whether you’re a security researcher hunting for bugs in the kernel or you just want to better understand how libbpf works.

Happy hacking!

Nix flake.nix

# Flake providing a cross-built kernel, a busybox initramfs, a QEMU launcher
# script and a development shell with Zig.
{
  description = "Rush E(bpf)";

  inputs = {
    nixpkgs.url = "github:NixOS/nixpkgs/nixpkgs-unstable";
    flake-utils.url = "github:numtide/flake-utils";
  };

  outputs = { self, nixpkgs, flake-utils }:
    (flake-utils.lib.eachDefaultSystem (system:
      let
        inherit (nixpkgs) lib;
        # The unpatched package set is only used to fetch and apply the patch.
        fetchpatch' = (import nixpkgs { inherit system; }).fetchpatch;
        # nixpkgs source tree with nixpkgs PR 461685 applied on top.
        pkgs' = (import nixpkgs { inherit system; }).applyPatches {
          name = "fix-libcap-cross-compile";
          src = nixpkgs;
          patches = [ (fetchpatch' {
            url = "https://patch-diff.githubusercontent.com/raw/NixOS/nixpkgs/pull/461685.patch";
            hash = "sha256-FXDc234uF05woQLYRvfVyA3FboWSmXnPhYOm8PeVs6Y=";
          }) ];
        };
        # Package set evaluated from the patched tree; everything below uses it.
        pkgs = import pkgs' { inherit system; };
      in
        {
          packages = {
            # Linux 6.18, taken from the gnu64 cross package set.
            kernel = pkgs.pkgsCross.gnu64.linuxKernel.kernels.linux_6_18;
            # newc cpio archive containing static busybox, the libcap binaries
            # and an /init script. The script mounts proc, sysfs and devtmpfs,
            # sets capabilities on /prog, starts a shell as uid/gid 1337 and
            # powers off once that shell exits. /prog itself is not added here.
            initramfs = pkgs.runCommand "build-initramfs" {}
              ''
                mkdir initramfs; cd initramfs
                mkdir -pv {etc,proc,sys,usr/{bin,sbin}}
                cp -a ${pkgs.pkgsCross.gnu64.pkgsStatic.busybox}/{bin,sbin} .
                chmod 755 ./{bin,sbin}
                cp -a ${pkgs.pkgsCross.gnu64.pkgsStatic.libcap}/bin .

                cat <<EOF > init
                #!/bin/sh
                mount -t proc none /proc
                mount -t sysfs none /sys
                mount -t devtmpfs devtmpfs /dev

                setcap 'cap_bpf=eip cap_net_admin=eip cap_perfmon=eip' /prog

                echo -e "\nBoot took \$(cut -d' ' -f1 /proc/uptime) seconds\n"

                setsid cttyhack setuidgid 1337 sh

                umount /proc
                poweroff -d 0 -f
                EOF
                chmod +x init

                find . -print0 | ${lib.getExe pkgs.cpio} --null -ov --format=newc > $out
              '';
            # Boots the kernel above in QEMU. The initrd is read from
            # rootfs.cpio in the current directory, the serial port is exposed
            # on the unix socket vm.sock, and setting DEBUG adds -s.
            run = pkgs.writeShellScript "run.sh" ''
              ${pkgs.qemu}/bin/qemu-system-x86_64 \
                  ''${DEBUG:+ -s} \
                  -m 512 \
                  -kernel ${self.packages.${system}.kernel}/bzImage \
                  -initrd rootfs.cpio \
                  -append "console=ttyS0 loglevel=3 oops=panic panic=-1" \
                  -no-reboot \
                  -nographic \
                  -monitor /dev/null \
                  -serial unix:vm.sock,server,nowait
            '';
          };

          # Development shell with Zig 0.15 and linux-scripts.
          devShells.default = with pkgs; mkShellNoCC {
            packages = [
              zig_0_15
              linux-scripts
            ];
            # https://github.com/NixOS/nixpkgs/pull/479423
            shellHook = "unset ZIG_GLOBAL_CACHE_DIR";
          };
        }
    ));
}

eBPF the Hard Way

eBPF Hello World

Passing data between kernel and user space

Type information (BTF)