15. 15
ebpf - The Linux bpf syscall
• kernel/bpf/syscall.c: The Linux kernel code related to
the bpf syscall.
• include/uapi/linux/bpf.h: The bpf header file for
assisting in using the bpf syscall.
19. 19
bpf() system call
From man-page bpf(2):
NAME
bpf - perform a command on an extended BPF map or program
SYNOPSIS
#include <linux/bpf.h>
int bpf(int cmd, union bpf_attr *attr, unsigned int size);
include/linux/syscalls.h
asmlinkage long sys_bpf(int cmd, union bpf_attr *attr, unsigned int size);
(749730ce42a2 bpf: enable bpf syscall on x64 and i386 (v3.18-rc1))
27. 27
Create a map with given type and
attributes
map_fd = bpf(BPF_MAP_CREATE, union bpf_attr
*attr, u32 size)
using attr->map_type, attr->key_size, attr-
>value_size, attr->max_entries
db20fd2b0108
bpf: add lookup/update/delete/iterate methods to BPF maps (v3.18-rc1)
28. 28
Create a map with given type and
attributes
Userspace example:
int bpf_create_map(enum bpf_map_type map_type, int key_size,
int value_size, int max_entries)
{
union bpf_attr attr = {
.map_type = map_type,
.key_size = key_size,
.value_size = value_size,
.max_entries = max_entries
};
return bpf(BPF_MAP_CREATE, &attr, sizeof(attr));
}
--
99c55f7d47c0 bpf: introduce BPF syscall and maps (v3.18-rc1)
29. 29
Create a map with given type and
attributes
Userspace example:
int bpf_create_map(enum bpf_map_type map_type, int key_size,
int value_size, int max_entries)
{
union bpf_attr attr = {
.map_type = map_type,
.key_size = key_size,
.value_size = value_size,
.max_entries = max_entries
};
return bpf(BPF_MAP_CREATE, &attr, sizeof(attr));
}
--
99c55f7d47c0 bpf: introduce BPF syscall and maps (v3.18-rc1)
member of bpf_attr union for BPF_MAP_CREATE
struct { /* anonymous struct used by BPF_MAP_CREATE command */
__u32 map_type; /* one of enum bpf_map_type */
__u32 key_size; /* size of key in bytes */
__u32 value_size; /* size of value in bytes */
__u32 max_entries; /* max number of entries in a map */
};
30. 30
lookup key in a given map
err = bpf(BPF_MAP_LOOKUP_ELEM, union bpf_attr
*attr, u32 size)
using attr->map_fd, attr->key, attr->value
db20fd2b0108
bpf: add lookup/update/delete/iterate methods to BPF maps (v3.18-rc1)
31. 31
Find and delete element by key in a
given map
err = bpf(BPF_MAP_DELETE_ELEM, union bpf_attr
*attr, u32 size)
using attr->map_fd, attr->key
db20fd2b0108
bpf: add lookup/update/delete/iterate methods to BPF maps (v3.18-rc1)
32. 32
lookup key in a given map
Userspace example:
int bpf_lookup_elem(int fd, const void *key, void *value)
{
union bpf_attr attr = {
.map_fd = fd,
.key = ptr_to_u64(key),
.value = ptr_to_u64(value),
};
return bpf(BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr));
}
int bpf_delete_elem(int fd, const void *key)
{
union bpf_attr attr = {
.map_fd = fd,
.key = ptr_to_u64(key),
};
return bpf(BPF_MAP_DELETE_ELEM, &attr, sizeof(attr));
}
33. 33
lookup key in a given map
Userspace example:
int bpf_lookup_elem(int fd, const void *key, void *value)
{
union bpf_attr attr = {
.map_fd = fd,
.key = ptr_to_u64(key),
.value = ptr_to_u64(value),
};
return bpf(BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr));
}
int bpf_delete_elem(int fd, const void *key)
{
union bpf_attr attr = {
.map_fd = fd,
.key = ptr_to_u64(key),
};
return bpf(BPF_MAP_DELETE_ELEM, &attr, sizeof(attr));
}
include/uapi/linux/bpf.h
/* anonymous struct used by BPF_MAP_*_ELEM commands */
struct {
__u32 map_fd;
__aligned_u64 key;
union {
__aligned_u64 value;
__aligned_u64 next_key;
};
__u64 flags;
};
34. 34
Create or update key/value pair in a
given map
err = bpf(BPF_MAP_UPDATE_ELEM, union bpf_attr
*attr, u32 size)
using attr->map_fd, attr->key, attr->value
db20fd2b0108
bpf: add lookup/update/delete/iterate methods to BPF maps (v3.18-rc1)
35. 35
Create or update key/value pair in a
given map
Userspace example:
int bpf_update_elem(int fd, const void *key, const void *value,
uint64_t flags)
{
union bpf_attr attr = {
.map_fd = fd,
.key = ptr_to_u64(key),
.value = ptr_to_u64(value),
.flags = flags,
};
return bpf(BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr));
}
36. 36
Create or update key/value pair in a
given map
Userspace example:
int bpf_update_elem(int fd, const void *key, const void *value,
uint64_t flags)
{
union bpf_attr attr = {
.map_fd = fd,
.key = ptr_to_u64(key),
.value = ptr_to_u64(value),
.flags = flags,
};
return bpf(BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr));
}
include/uapi/linux/bpf.h
/* flags for BPF_MAP_UPDATE_ELEM command */
#define BPF_ANY 0 /* create new element or update existing */
#define BPF_NOEXIST 1 /* create new element if it didn't exist */
#define BPF_EXIST 2 /* update existing element */
64. Unpublished Work of SUSE. All Rights Reserved.
This work is an unpublished work and contains confidential, proprietary and trade secret information of SUSE.
Access to this work is restricted to SUSE employees who have a need to know to perform tasks within the scope of
their assignments. No part of this work may be practiced, performed, copied, distributed, revised, modified, translated,
abridged, condensed, expanded, collected, or adapted without the prior written consent of SUSE.
Any use or exploitation of this work without authorization could subject the perpetrator to criminal and civil liability.
General Disclaimer
This document is not to be construed as a promise by any participating company to develop, deliver, or market a
product. It is not a commitment to deliver any material, code, or functionality, and should not be relied upon in making
purchasing decisions. SUSE makes no representations or warranties with respect to the contents of this document,
and specifically disclaims any express or implied warranties of merchantability or fitness for any particular purpose. The
development, release, and timing of features or functionality described for SUSE products remains at the sole
discretion of SUSE. Further, SUSE reserves the right to revise this document and to make changes to its content, at
any time, without obligation to notify any person or entity of such revisions or changes. All SUSE marks referenced in
this presentation are trademarks or registered trademarks of Novell, Inc. in the United States and other countries. All
third-party trademarks are the property of their respective owners.
Editor's Notes
“'union bpf_attr' is backwards compatible with future extensions.”
From 99c55f7d47c0 bpf: introduce BPF syscall and maps (v3.18-rc1)
“This is a C union which allows for different C structs to be passed to the bpf syscall depending on which command is being used. The code for it can be found in the include/uapi/linux/bpf.h file of the Linux kernel. The relevant C struct from this C union will be included in code examples that use the bpf_attr union so readers can see the form of the struct being used.”
From https://qmonnet.github.io/whirl-offload/2016/09/01/dive-into-bpf/
The return value for this command is a new file descriptor associated with this eBPF program.
The return value for this command is a new file descriptor associated with this eBPF program.
prog_type is one of the available program types:
For further details of eBPF program types, see below.
The remaining fields of bpf_attr are set as follows:
* insns is an array of struct bpf_insn instructions.
* insn_cnt is the number of instructions in the program referred to by insns.
* license is a license string, which must be GPL compatible to call helper functions marked gpl_only. (The licensing rules are the same as for
kernel modules, so that also dual licenses, such as "Dual BSD/GPL", may be used.)
* log_buf is a pointer to a caller-allocated buffer in which the in-kernel verifier can store the verification log. This log is a multi-line
string that can be checked by the program author in order to understand how the verifier came to the conclusion that the eBPF program is unsafe.
The format of the output can change at any time as the verifier evolves.
* log_size is the size of the buffer pointed to by log_buf. If the size of the buffer is not large enough to store all verifier messages, -1 is returned
and errno is set to ENOSPC.
* log_level is the verbosity level of the verifier. A value of zero means that the verifier will not provide a log; in this case, log_buf must be a NULL
pointer, and log_size must be zero.
The return value for this command is a new file descriptor associated with this eBPF program.
prog_type is one of the available program types:
For further details of eBPF program types, see below.
The remaining fields of bpf_attr are set as follows:
* insns is an array of struct bpf_insn instructions.
* insn_cnt is the number of instructions in the program referred to by insns.
* license is a license string, which must be GPL compatible to call helper functions marked gpl_only. (The licensing rules are the same as for
kernel modules, so that also dual licenses, such as "Dual BSD/GPL", may be used.)
* log_buf is a pointer to a caller-allocated buffer in which the in-kernel verifier can store the verification log. This log is a multi-line
string that can be checked by the program author in order to understand how the verifier came to the conclusion that the eBPF program is unsafe.
The format of the output can change at any time as the verifier evolves.
* log_size is the size of the buffer pointed to by log_buf. If the size of the buffer is not large enough to store all verifier messages, -1 is returned
and errno is set to ENOSPC.
* log_level is the verbosity level of the verifier. A value of zero means that the verifier will not provide a log; in this case, log_buf must be a NULL
pointer, and log_size must be zero.
On success, this operation returns a file descriptor.
On error, -1 is returned and errno is set to EINVAL, EPERM, or ENOMEM.
//// member of bpf_attr union for BPF_MAP_CREATE
//
// struct { /* anonymous struct used by BPF_MAP_CREATE command */
// __u32 map_type; /* one of enum bpf_map_type */
// __u32 key_size; /* size of key in bytes */
// __u32 value_size; /* size of value in bytes */
// __u32 max_entries; /* max number of entries in a map */
// __u32 map_flags; /* prealloc or not */
// };
//// member of bpf_attr union for BPF_MAP_CREATE
//
// struct { /* anonymous struct used by BPF_MAP_CREATE command */
// __u32 map_type; /* one of enum bpf_map_type */
// __u32 key_size; /* size of key in bytes */
// __u32 value_size; /* size of value in bytes */
// __u32 max_entries; /* max number of entries in a map */
// __u32 map_flags; /* prealloc or not */
// };
If an element is found, the operation returns zero and stores the element's value into value, which must point to a buffer of value_size bytes.
If no element is found, the operation returns -1 and sets errno to ENOENT.
On success, zero is returned.
If the element is not found, -1 is returned and errno is set to ENOENT.
[BPF_MAP_LOOKUP_ELEM]
The BPF_MAP_LOOKUP_ELEM command looks up an element with a given key in the map referred to by the file descriptor fd.
If an element is found, the operation returns zero and stores the element's value into value, which must point to a buffer of value_size bytes.
[BPF_MAP_DELETE_ELEM]
On success, zero is returned.
If the element is not found, -1 is returned and errno is set to ENOENT.
The BPF_MAP_LOOKUP_ELEM command looks up an element with a given key in the map referred to by the file descriptor fd.
If an element is found, the operation returns zero and stores the element's value into value, which must point to a buffer of value_size bytes.
On success, the operation returns zero.
On error, -1 is returned and errno is set to EINVAL, EPERM, ENOMEM, or E2BIG.
E2BIG indicates that the number of elements in the map reached the max_entries limit specified at map creation time.
EEXIST will be returned if flags specifies
BPF_NOEXIST and the element with key already exists in the map.
ENOENT will be returned if flags specifies BPF_EXIST and the element with key doesn't exist in the map.
//// member of bpf_attr union for BPF_MAP_LOOKUP_ELEM, BPF_MAP_UPDATE_ELEM,
//// and BPF_MAP_DELETE_ELEM
//
// struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */
// __u32 map_fd;
// __aligned_u64 key;
// union {
// __aligned_u64 value;
// __aligned_u64 next_key;
// };
// __u64 flags;
// };
The BPF_MAP_UPDATE_ELEM command also allows a flag to be specified which communicates the desired action depending on whether the key already exists when the update is performed. As you can see from the comments in the code block below, this command really is three: set the value only if no prior value exists, set the value only if a prior value exists, or set the value regardless of whether a prior value exists.
11 different types as of Linux kernel 4.11.
16 different types as of Linux kernel 4.15.
Along with this, the Linux kernel reserves the first C enum option as BPF_MAP_TYPE_UNSPEC to ensure that zero isn’t a valid map type. Presumably, this is so that zero, with its many forms in C, does not accidentally get passed as the map type argument.
11 different types as of Linux kernel 4.11.
16 different types as of Linux kernel 4.15.
Along with this, the Linux kernel reserves the first C enum option as BPF_MAP_TYPE_UNSPEC to ensure that zero isn’t a valid map type. Presumably, this is so that zero, with its many forms in C, does not accidentally get passed as the map type argument.
Along with this, the Linux kernel reserves the first C enum option as BPF_MAP_TYPE_UNSPEC to ensure that zero isn’t a valid map type.
Presumably, this is so that zero, with its many forms in C, does not accidentally get passed as the map type argument.
enum bpf_map_type {
BPF_MAP_TYPE_UNSPEC,
/* Reserve 0 as invalid map type */
BPF_MAP_TYPE_HASH,
BPF_MAP_TYPE_ARRAY,
BPF_MAP_TYPE_PROG_ARRAY,
};
BPF_MAP_TYPE_HASH
Hash-table maps have the following characteristics:
* Maps are created and destroyed by user-space programs. Both user-space and eBPF programs can perform lookup, update, and delete operations.
* The kernel takes care of allocating and freeing key/value pairs.
* The map_update_elem() helper will fail to insert a new element when the max_entries limit is reached. (This ensures that eBPF programs cannot exhaust memory.)
* map_update_elem() replaces existing elements atomically.
Hash-table maps are optimized for speed of lookup.
This functions like the hash-table type above, except that it indexes the entries like an array, meaning that for a map with n elements you can only use indexes 0 to n-1.
With its release in August 2015, version 4.2 of the Linux kernel added the eBPF-map type BPF_MAP_TYPE_PROG_ARRAY. This is one of the more interesting eBPF-map types because it allows tail calling of eBPF programs! And, as you may have guessed, the BPF_MAP_TYPE_PROG_ARRAY holds file descriptors of loaded eBPF programs as its values. The man page states that, as of this writing, both the key and value must be 4 bytes in size. Thus the common thing to do is use numbers to identify the different eBPF program types. With this pattern also comes the bpf_tail_call helper function. This function can be invoked by an eBPF program to look up a program from an eBPF-map of type BPF_MAP_TYPE_PROG_ARRAY with a given key and then jump into that program.
With version 4.4 of the Linux kernel, released in January 2016, eBPF was integrated into the perf tooling system. For those unfamiliar with it, perf is a tool in Linux that can be used for a wide swath of performance monitoring including CPU performance counters, tracepoints, kprobes, and uprobes (dynamic tracing).
The usage of BPF_MAP_TYPE_PERF_EVENT_ARRAY isn’t super clear as there appear to be only two examples of directly using it. Both of these can be found in the following locations within the Linux kernel repository.
samples/bpf/tracex6_*.c: These two files (tracex6_kern.c and tracex6_user.c) form a simplistic example. However, due to lack of comments or documentation, understanding its function isn’t obvious. The commit message for the example’s creation, done by Kaixu Xia, states the example “shows how to use the new ability to get the selected Hardware PMU counter value”.
samples/bpf/trace_output_*.c: These two files (trace_output_kern.c and trace_output_user.c) form the more complex example. This one also lacks code documentation. The commit message for this example’s creation, done by Alexei Starovoitov, states that a “kprobe is attached to sys_write() and trivial bpf program streams pid+cookie into userspace via PERF_COUNT_SW_BPF_OUTPUT event”.
With its release in May 2016, version 4.6 of the Linux kernel added the eBPF-map types BPF_MAP_TYPE_PERCPU_HASH and BPF_MAP_TYPE_PERCPU_ARRAY. These two are nearly identical to BPF_MAP_TYPE_HASH and BPF_MAP_TYPE_ARRAY except that one is created for each CPU core. This allows for lock free uses of hash-tables and arrays in eBPF for high performance needs. Though, of course, it must be an application where the divided results can be reconciled in the end.
There are a few minor technical details about the per-cpu eBPF-map types. For those interested in the details, check out the initial commit for each below:
BPF_MAP_TYPE_PERCPU_HASH, initial commit 824bd0ce6c7c43a9e1e210abf124958e54d88342
BPF_MAP_TYPE_PERCPU_ARRAY, initial commit a10423b87a7eae75da79ce80a8d9475047a674ee
With its release in May 2016, version 4.6 of the Linux kernel added the eBPF-map types BPF_MAP_TYPE_PERCPU_HASH and BPF_MAP_TYPE_PERCPU_ARRAY. These two are nearly identical to BPF_MAP_TYPE_HASH and BPF_MAP_TYPE_ARRAY except that one is created for each CPU core. This allows for lock free uses of hash-tables and arrays in eBPF for high performance needs. Though, of course, it must be an application where the divided results can be reconciled in the end.
There are a few minor technical details about the per-cpu eBPF-map types. For those interested in the details, check out the initial commit for each below:
BPF_MAP_TYPE_PERCPU_HASH, initial commit 824bd0ce6c7c43a9e1e210abf124958e54d88342
BPF_MAP_TYPE_PERCPU_ARRAY, initial commit a10423b87a7eae75da79ce80a8d9475047a674ee