From 51e99f9880052c051f8774f63af5073d82503c7d Mon Sep 17 00:00:00 2001 From: Dmitrii Kuvaiskii Date: Mon, 22 Jan 2024 01:27:15 -0800 Subject: [PATCH] [LibOS] Add secure implementation of eventfd The previous implementation of eventfd was insecure and disabled by default (enabled via `sys.insecure__allow_eventfd` manifest option). The new implementation of eventfd is secure but currently restricted to a single process: eventfd objects created in parent process become invalid in children. However, sometimes it is acceptable for applications to use host-based insecure eventfd implementation. Because of these cases, we keep the old ``sys.insecure__allow_eventfd`` manifest syntax. LibOS regression tests are added to test the secure implementation of eventfd. Three LTP tests are enabled. Memcached and Rust examples also use the secure eventfd now. Signed-off-by: Dmitrii Kuvaiskii --- .../memcached/memcached.manifest.template | 4 - CI-Examples/nginx/Makefile | 6 +- .../rust-hyper-http-server.manifest.template | 6 - Documentation/devel/features.md | 34 ++- Documentation/manifest-syntax.rst | 21 +- libos/include/libos_fs.h | 4 + libos/include/libos_handle.h | 4 + libos/src/fs/eventfd/fs.c | 238 ++++++++++++++++-- libos/src/sys/libos_epoll.c | 8 +- libos/src/sys/libos_eventfd.c | 68 +++-- libos/src/sys/libos_poll.c | 6 + libos/test/ltp/ltp.cfg | 4 - ...tfd_fork_allowed_failing.manifest.template | 26 ++ libos/test/regression/eventfd_races.c | 222 ++++++++++++++++ libos/test/regression/manifest.template | 3 - libos/test/regression/meson.build | 1 + libos/test/regression/test_libos.py | 14 ++ libos/test/regression/tests.toml | 2 + libos/test/regression/tests_musl.toml | 2 + 19 files changed, 602 insertions(+), 71 deletions(-) create mode 100644 libos/test/regression/eventfd_fork_allowed_failing.manifest.template create mode 100644 libos/test/regression/eventfd_races.c diff --git a/CI-Examples/memcached/memcached.manifest.template b/CI-Examples/memcached/memcached.manifest.template index 8130ac1e55..41491d8b99 100644 --- a/CI-Examples/memcached/memcached.manifest.template +++ b/CI-Examples/memcached/memcached.manifest.template @@ -18,10 +18,6 @@ loader.env.LD_LIBRARY_PATH = "/lib:/usr/{{ arch_libdir }}" loader.uid = 1000 loader.gid = 1000 -# Memcached requires `eventfd` for worker thread notifications, starting from v1.6.11. If you have a -# Memcached version older than that, we recommend to remove this insecure manifest option. -sys.insecure__allow_eventfd = true - sys.enable_sigterm_injection = true fs.mounts = [ diff --git a/CI-Examples/nginx/Makefile b/CI-Examples/nginx/Makefile index d5fc23758c..4a87a0c4a5 100644 --- a/CI-Examples/nginx/Makefile +++ b/CI-Examples/nginx/Makefile @@ -29,8 +29,10 @@ ifeq ($(SGX),1) all: nginx.manifest.sgx nginx.sig endif -# Note that Gramine doesn't support eventfd() and PR_SET_DUMPABLE, so we manually -# overwrite these macros in the autogenerated configuration header of Nginx. +# Note that Gramine doesn't support PR_SET_DUMPABLE, so we manually overwrite this macro in the +# autogenerated configuration header of Nginx. Additionally, we disable support for eventfd, so that +# Nginx falls back to alternative Inter Process Communication (IPC) means; recall that Gramine's +# multi-process support for eventfd is insecure, thus we prefer not to use it here. $(INSTALL_DIR)/sbin/nginx: $(NGINX_SRC)/configure cd $(NGINX_SRC) && ./configure --prefix=$(abspath $(INSTALL_DIR)) \ --without-http_rewrite_module --with-http_ssl_module diff --git a/CI-Examples/rust/rust-hyper-http-server.manifest.template b/CI-Examples/rust/rust-hyper-http-server.manifest.template index ae9f2a4530..bb011b96e7 100644 --- a/CI-Examples/rust/rust-hyper-http-server.manifest.template +++ b/CI-Examples/rust/rust-hyper-http-server.manifest.template @@ -30,12 +30,6 @@ sgx.trusted_files = [ "file:{{ arch_libdir }}/", ] -# The Tokio runtime requires eventfd, and the Gramine implementation -# currently relies on the host in an insecure manner. This setting isn't -# suitable for production deployment, but works well as a stopgap during -# development while a proper implementation in Gramine is being worked on. -sys.insecure__allow_eventfd = true - # The maximum number of threads in a single process needs to be declared in advance. # You need to account for: # - one main thread diff --git a/Documentation/devel/features.md b/Documentation/devel/features.md index 3afbb60267..2266d6353c 100644 --- a/Documentation/devel/features.md +++ b/Documentation/devel/features.md @@ -2657,21 +2657,35 @@ Gramine and is supported. ### Event notifications (eventfd) -Gramine currently implements an *insecure* version of the `eventfd()` system call. It is considered -insecure in the context of SGX backend because it relies on the host OS, which could for example -maliciously drop an event or inject a random one. To enable this `eventfd()` implementation, the -manifest file must contain `sys.insecure__allow_eventfd = true` ({ref}`see manifest documentation -`). +There are two modes of eventfd: -Gramine supports polling on eventfd via `poll()`, `ppoll()`, `select()`, `epoll_*()` system calls. +1. Secure "emulate-in-Gramine" -- the eventfd object is created inside Gramine, and all operations + are resolved entirely inside Gramine. A dummy eventfd object is created on the host, purely to + trigger read/write notifications (e.g., in epoll); eventfd values are verified inside Gramine and + are never exposed to the host. Since the host is used purely for notifications, a malicious host + can only induce Denial of Service (DoS) attacks; thus this implementation is secure and enabled + by default. This implementation is automatically disabled if `sys.insecure__allow_eventfd` + {ref}`manifest option ` is enabled. -Gramine may implement a secure version of `eventfd()` for communication between Gramine processes in -the future. Such secure version will *not* be able to receive events from the host OS. + The emulation is currently implemented at the level of a single process. The emulation *may* work + for multi-process applications, e.g., if the child process inherits the eventfd object but + doesn't use it. However, all eventfds created in the parent process are marked as invalid in + child processes, i.e. inter-process communication via eventfds is not allowed. + + Note that this secure version is *not* able to receive events from the host OS. + +2. Insecure "passthrough-to-host" -- the eventfd object is created on the host, and all operations + are delegated to the host. Since this implementation is insecure, it is disallowed by default. To + use this implementation, it must be explicitly allowed via the `sys.insecure__allow_eventfd` + {ref}`manifest option `. + +Gramine supports polling on eventfd via `poll()`, `ppoll()`, `select()`, `epoll_*()` system calls, +in both secure and insecure modes.
Related system calls -- ▣ `eventfd()`: insecure implementation -- ▣ `eventfd2()`: insecure implementation +- ☑ `eventfd()`: see notes above +- ☑ `eventfd2()`: see notes above - ☑ `close()` - ☑ `read()` diff --git a/Documentation/manifest-syntax.rst b/Documentation/manifest-syntax.rst index 84bc043712..9e702f84b7 100644 --- a/Documentation/manifest-syntax.rst +++ b/Documentation/manifest-syntax.rst @@ -327,17 +327,28 @@ a 1 |~| MiB brk size. .. _allowing-eventfd: -Allowing eventfd -^^^^^^^^^^^^^^^^ +Allowing host-based insecure eventfd +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ :: sys.insecure__allow_eventfd = [true|false] (Default: false) -This specifies whether to allow system calls `eventfd()` and `eventfd2()`. Since -eventfd emulation currently relies on the host, these system calls are -disallowed by default due to security concerns. +By default, Gramine implements eventfd in a secure but restricted way: currently +this secure implementation only works when eventfd usage is confined to a single +process (note that application may still be multi-process and spawn child +processes, but eventfds created in parent will be invalid in children). + +However, sometimes it is acceptable for applications to use host-based insecure +eventfd implementation. This implementation works without the above-mentioned +restriction in multi-process applications. Use ``sys.insecure__allow_eventfd`` +manifest syntax to switch to this insecure implementation. + +.. note :: + ``sys.insecure__allow_eventfd`` is pass-through and thus potentially insecure + in e.g. SGX environments. It is the responsibility of the app developer to + analyze the app usage of eventfd, with security implications in mind. .. _external-sigterm-injection: diff --git a/libos/include/libos_fs.h b/libos/include/libos_fs.h index cbee338138..116d17c628 100644 --- a/libos/include/libos_fs.h +++ b/libos/include/libos_fs.h @@ -182,6 +182,10 @@ struct libos_fs_ops { /* Poll a single handle. Must not block. */ int (*poll)(struct libos_handle* hdl, int in_events, int* out_events); + /* Verify a single handle after poll. Must update `pal_ret_events` in-place with only allowed + * ones. Used in e.g. secure eventfd FS to verify if the host is not lying to us. */ + void (*post_poll)(struct libos_handle* hdl, pal_wait_flags_t* pal_ret_events); + /* checkpoint/migrate the file system */ ssize_t (*checkpoint)(void** checkpoint, void* mount_data); int (*migrate)(void* checkpoint, void** mount_data); diff --git a/libos/include/libos_handle.h b/libos/include/libos_handle.h index 81c28c91d8..5bafa23680 100644 --- a/libos/include/libos_handle.h +++ b/libos/include/libos_handle.h @@ -135,7 +135,11 @@ struct libos_epoll_handle { }; struct libos_eventfd_handle { + bool broken_in_child; bool is_semaphore; + spinlock_t lock; /* protects below fields */ + uint64_t val; + uint64_t dummy_host_val; }; struct libos_handle { diff --git a/libos/src/fs/eventfd/fs.c b/libos/src/fs/eventfd/fs.c index abb01b74d7..5f32aa5819 100644 --- a/libos/src/fs/eventfd/fs.c +++ b/libos/src/fs/eventfd/fs.c @@ -1,8 +1,9 @@ /* SPDX-License-Identifier: LGPL-3.0-or-later */ -/* Copyright (C) 2019 Intel Corporation */ +/* Copyright (C) 2024 Intel Corporation */ /* - * This file contains code for passthrough-to-host implementation of 'eventfd' filesystem. + * This file contains code for passthrough-to-host and emulate-in-libos implementations of 'eventfd' + * filesystem. For more information on the modes, see `libos_eventfd.c`. */ #include "libos_fs.h" @@ -12,44 +13,243 @@ #include "linux_abi/errors.h" #include "pal.h" +/* In emulate-in-libos mode, enforce a restriction: all eventfds created in the parent process are + * marked as invalid in child processes, i.e. inter-process communication via eventfds is not + * allowed. This restriction is because LibOS doesn't yet implement sync between eventfd objects. */ +static int eventfd_checkin(struct libos_handle* hdl) { + assert(hdl->type == TYPE_EVENTFD); + if (!g_eventfd_passthrough_mode) + hdl->info.eventfd.broken_in_child = true; + return 0; +} + +static void eventfd_dummy_host_read(struct libos_handle* hdl) { + int ret; + uint64_t buf_dummy_host_val = 0; + size_t dummy_host_val_count = sizeof(buf_dummy_host_val); + do { + ret = PalStreamRead(hdl->pal_handle, /*offset=*/0, &dummy_host_val_count, + &buf_dummy_host_val); + } while (ret == -PAL_ERROR_INTERRUPTED); + if (ret < 0 || dummy_host_val_count != sizeof(buf_dummy_host_val)) { + /* must not happen in benign case, consider it an attack and panic */ + BUG(); + } +} + +static void eventfd_dummy_host_write(struct libos_handle* hdl) { + int ret; + uint64_t buf_dummy_host_val = 1; + size_t dummy_host_val_count = sizeof(buf_dummy_host_val); + do { + ret = PalStreamWrite(hdl->pal_handle, /*offset=*/0, &dummy_host_val_count, + &buf_dummy_host_val); + } while (ret == -PAL_ERROR_INTERRUPTED); + if (ret < 0 || dummy_host_val_count != sizeof(buf_dummy_host_val)) { + /* must not happen in benign case, consider it an attack and panic */ + BUG(); + } +} + +static void eventfd_dummy_host_wait(struct libos_handle* hdl) { + pal_wait_flags_t wait_for_events = PAL_WAIT_READ; + pal_wait_flags_t ret_events = 0; + int ret = PalStreamsWaitEvents(1, &hdl->pal_handle, &wait_for_events, &ret_events, NULL); + if (ret < 0 && ret != -PAL_ERROR_INTERRUPTED) { + BUG(); + } + (void)ret_events; /* we don't care what events the host returned, we can't trust them anyway */ +} + static ssize_t eventfd_read(struct libos_handle* hdl, void* buf, size_t count, file_off_t* pos) { __UNUSED(pos); if (count < sizeof(uint64_t)) return -EINVAL; - int ret = PalStreamRead(hdl->pal_handle, /*offset=*/0, &count, buf); - if (!ret && count != sizeof(uint64_t)) { - /* successful read must return 8 bytes, otherwise it's an attack or host malfunction */ - return -EPERM; + if (g_eventfd_passthrough_mode) { + /* passthrough-to-host mode */ + int ret = PalStreamRead(hdl->pal_handle, /*offset=*/0, &count, buf); + if (!ret && count != sizeof(uint64_t)) { + /* successful read must return 8 bytes, otherwise it's an attack or host malfunction */ + return -EPERM; + } + ret = pal_to_unix_errno(ret); + /* eventfd objects never perform partial reads, see also check above */ + maybe_epoll_et_trigger(hdl, ret, /*in=*/true, /*unused was_partial=*/false); + return ret < 0 ? ret : (ssize_t)count; } - ret = pal_to_unix_errno(ret); - /* eventfd objects never perform partial reads, see also check above */ + + /* emulate-in-libos mode */ + if (hdl->info.eventfd.broken_in_child) { + log_warning("Child process tried to access eventfd created by parent process. This is " + "disallowed in Gramine (but see `sys.insecure__allow_eventfd`)."); + return -EIO; + } + + int ret; + spinlock_lock(&hdl->info.eventfd.lock); + + while (!hdl->info.eventfd.val) { + if (hdl->flags & O_NONBLOCK) { + ret = -EAGAIN; + goto out; + } + spinlock_unlock(&hdl->info.eventfd.lock); + eventfd_dummy_host_wait(hdl); + spinlock_lock(&hdl->info.eventfd.lock); + } + + if (!hdl->info.eventfd.is_semaphore) { + memcpy(buf, &hdl->info.eventfd.val, sizeof(uint64_t)); + hdl->info.eventfd.val = 0; + } else { + uint64_t one_val = 1; + memcpy(buf, &one_val, sizeof(uint64_t)); + hdl->info.eventfd.val--; + } + + /* perform a read (not supposed to block) to clear the event from polling threads and to send an + * event to writing threads */ + if (hdl->info.eventfd.dummy_host_val) { + eventfd_dummy_host_read(hdl); + hdl->info.eventfd.dummy_host_val = 0; + } + + ret = (ssize_t)count; +out: + spinlock_unlock(&hdl->info.eventfd.lock); maybe_epoll_et_trigger(hdl, ret, /*in=*/true, /*unused was_partial=*/false); - return ret < 0 ? ret : (ssize_t)count; + return ret; } static ssize_t eventfd_write(struct libos_handle* hdl, const void* buf, size_t count, file_off_t* pos) { __UNUSED(pos); - if (count < sizeof(uint64_t)) + /* note: before v6.9, Linux kernel had a `<` comparison; here we adhere to the latest Linux + * version (v6.9) that switched to `!=` */ + if (count != sizeof(uint64_t)) return -EINVAL; - int ret = PalStreamWrite(hdl->pal_handle, /*offset=*/0, &count, (void*)buf); - if (!ret && count != sizeof(uint64_t)) { - /* successful write must return 8 bytes, otherwise it's an attack or host malfunction */ - return -EPERM; + uint64_t buf_val; + memcpy(&buf_val, buf, sizeof(uint64_t)); + if (buf_val == UINT64_MAX) + return -EINVAL; + + if (buf_val == 0) { + /* Note: we explicitly emulate eventfd write-of-zero as a no-op. Linux kernel ultimately + * also no-ops on write-of-zero despite the lack of an explicit early-return condition + * (https://elixir.bootlin.com/linux/v6.8/source/fs/eventfd.c#L247). This explicit check is + * required in Gramine due to how our polling works. */ + return (ssize_t)count; + } + + if (g_eventfd_passthrough_mode) { + /* passthrough-to-host mode */ + int ret = PalStreamWrite(hdl->pal_handle, /*offset=*/0, &count, (void*)buf); + if (!ret && count != sizeof(uint64_t)) { + /* successful write must return 8 bytes, otherwise it's an attack or host malfunction */ + return -EPERM; + } + ret = pal_to_unix_errno(ret); + /* eventfd objects never perform partial writes, see also check above */ + maybe_epoll_et_trigger(hdl, ret, /*in=*/false, /*unused was_partial=*/false); + return ret < 0 ? ret : (ssize_t)count; + } + + /* emulate-in-libos mode */ + if (hdl->info.eventfd.broken_in_child) { + log_warning("Child process tried to access eventfd created by parent process. This is " + "disallowed in Gramine (but see `sys.insecure__allow_eventfd`)."); + return -EIO; } - ret = pal_to_unix_errno(ret); - /* eventfd objects never perform partial writes, see also check above */ + + int ret; + spinlock_lock(&hdl->info.eventfd.lock); + + uint64_t val; + while (__builtin_add_overflow(hdl->info.eventfd.val, buf_val, &val) || val > UINT64_MAX - 1) { + if (hdl->flags & O_NONBLOCK) { + ret = -EAGAIN; + goto out; + } + spinlock_unlock(&hdl->info.eventfd.lock); + /* + * For the corner case of write overflowing the counter (and thus blocking), we choose to + * emulate blocking via busy loop. This may burn cycles and starve other threads. + * Additionally, host polls on eventfd write events will *always* report that eventfd is + * available for writing, even if it is actually supposed to be blocked; our emulation + * removes the "available for writing" return event (see `eventfd_post_poll()`) so the user + * app sees that the poll returned but doesn't see the write event (i.e. spurious return). + * Overflowing eventfd writes are considered sufficiently rare, so our busy-loop emulation + * should not hamper performance of normal apps. + * + * An alternative would be to (1) use a separate synchronization mechanism on which the + * writer would block, or to (2) keep the host value in sync with the LibOS value. The + * former alternative would introduce complexity, whereas the latter alternative would + * violate confidentiality (by leaking eventfd counter). + */ + PalThreadYieldExecution(); + spinlock_lock(&hdl->info.eventfd.lock); + } + + hdl->info.eventfd.val = val; + + /* perform a write (not supposed to block) to send an event to reading/polling threads */ + if (hdl->info.eventfd.dummy_host_val >= UINT64_MAX - 1) + BUG(); + hdl->info.eventfd.dummy_host_val++; + eventfd_dummy_host_write(hdl); + + ret = (ssize_t)count; +out: + spinlock_unlock(&hdl->info.eventfd.lock); maybe_epoll_et_trigger(hdl, ret, /*in=*/false, /*unused was_partial=*/false); - return ret < 0 ? ret : (ssize_t)count; + return ret; +} + +static void eventfd_post_poll(struct libos_handle* hdl, pal_wait_flags_t* pal_ret_events) { + if (g_eventfd_passthrough_mode) + return; + + if (hdl->info.eventfd.broken_in_child) { + log_warning("Child process tried to access eventfd created by parent process. This is " + "disallowed in Gramine (but see `sys.insecure__allow_eventfd`)."); + *pal_ret_events = PAL_WAIT_ERROR; + return; + } + + if (*pal_ret_events & (PAL_WAIT_ERROR | PAL_WAIT_HANG_UP)) { + /* impossible: we control eventfd inside the LibOS, and we never raise such conditions */ + BUG(); + } + + spinlock_lock(&hdl->info.eventfd.lock); + if (*pal_ret_events & PAL_WAIT_READ) { + /* there is data to read: verify if counter has value greater than zero */ + if (!hdl->info.eventfd.val) { + /* spurious or malicious notification, can legitimately happen if another thread + * consumed this event between this thread's poll wakeup and the post_poll callback; + * we currently choose to return a spurious notification to the user */ + *pal_ret_events &= ~PAL_WAIT_READ; + } + } + if (*pal_ret_events & PAL_WAIT_WRITE) { + /* verify it's really possible to write a value of at least "1" without blocking */ + if (hdl->info.eventfd.val >= UINT64_MAX - 1) { + /* spurious or malicious notification, see comment above */ + *pal_ret_events &= ~PAL_WAIT_WRITE; + } + } + spinlock_unlock(&hdl->info.eventfd.lock); } struct libos_fs_ops eventfd_fs_ops = { - .read = &eventfd_read, - .write = &eventfd_write, + .checkin = &eventfd_checkin, + .read = &eventfd_read, + .write = &eventfd_write, + .post_poll = &eventfd_post_poll, }; struct libos_fs eventfd_builtin_fs = { diff --git a/libos/src/sys/libos_epoll.c b/libos/src/sys/libos_epoll.c index fab6965eee..06c78f5559 100644 --- a/libos/src/sys/libos_epoll.c +++ b/libos/src/sys/libos_epoll.c @@ -181,9 +181,9 @@ void maybe_epoll_et_trigger(struct libos_handle* handle, int ret, bool in, bool * Some workloads (e.g. rust's tokio crate) use eventfd with EPOLLET in a peculiar * way: each write to that eventfd increases counter by 1 and thanks to EPOLLET is * reported by epoll only once, even if there is no read from eventfd. + * * To handle such usage pattern, we mark eventfd as read-epollet-pollable on each * write - we assume that eventfd is not shared between processes. - * Hopefully no app tries to increase the eventfd counter by 0... */ __atomic_store_n(&handle->needs_et_poll_in, true, __ATOMIC_RELEASE); needs_et = true; @@ -658,6 +658,11 @@ static int do_epoll_wait(int epfd, struct epoll_event* events, int maxevents, in continue; } + if (items[i]->handle->fs && items[i]->handle->fs->fs_ops + && items[i]->handle->fs->fs_ops->post_poll) { + items[i]->handle->fs->fs_ops->post_poll(items[i]->handle, &pal_ret_events[i]); + } + uint32_t this_item_events = 0; if (pal_ret_events[i] & PAL_WAIT_ERROR) { this_item_events |= EPOLLERR; @@ -674,6 +679,7 @@ static int do_epoll_wait(int epfd, struct epoll_event* events, int maxevents, in this_item_events |= items[i]->events & (EPOLLOUT | EPOLLWRNORM); } + /* TODO: move this to socket FS's post_poll() callback */ if (items[i]->handle->type == TYPE_SOCK && (pal_ret_events[i] & (PAL_WAIT_READ | PAL_WAIT_WRITE))) { bool error_event = !!(pal_ret_events[i] & (PAL_WAIT_ERROR | PAL_WAIT_HANG_UP)); diff --git a/libos/src/sys/libos_eventfd.c b/libos/src/sys/libos_eventfd.c index 8c9ca027b8..f4435460c8 100644 --- a/libos/src/sys/libos_eventfd.c +++ b/libos/src/sys/libos_eventfd.c @@ -1,23 +1,57 @@ /* SPDX-License-Identifier: LGPL-3.0-or-later */ -/* Copyright (C) 2019 Intel Corporation */ +/* Copyright (C) 2024 Intel Corporation */ /* - * Implementation of system calls "eventfd" and "eventfd2". Since eventfd emulation currently relies - * on the host, these system calls are disallowed by default due to security concerns. To use them, - * they must be explicitly allowed through the "sys.insecure__allow_eventfd" manifest key. + * Implementation of system calls "eventfd" and "eventfd2". + * + * There are two modes of eventfd: + * + * 1. Passthrough-to-host -- the eventfd object is created on the host, and all operations are + * delegated to the host. Since this implementation is insecure, it is disallowed by default. To + * use this implementation, it must be explicitly allowed via the `sys.insecure__allow_eventfd` + * manifest option. + * + * 2. Emulate-in-libos -- the eventfd object is created inside the LibOS, and all operations are + * resolved entirely inside the LibOS. A dummy eventfd object is created on the host, purely to + * trigger read/write notifications (e.g., in epoll); eventfd values are verified inside the + * LibOS and are never exposed to the host. Since the host is used purely for notifications (see + * notes below), this implementation is considered secure and enabled by default. It is + * automatically disabled if the manifest option `sys.insecure__allow_eventfd` is enabled. + * + * - The emulation is currently implemented at the level of a single process. The emulation *may* + * work for multi-process applications, e.g., if the child process inherits the eventfd object + * but doesn't use it. However, all eventfds created in the parent process are marked as + * invalid in child processes, i.e. inter-process communication via eventfds is not allowed. + * + * - The host's eventfd object is "dummy" and used purely for notifications -- to unblock + * blocking read/write/select/poll/epoll system calls. The read/write notify logic is already + * hardened, by double-checking that the object was indeed updated. However, there are three + * possible attacks on polling mechanisms (select/poll/epoll): + * + * a. Malicious host may inject the notification too early: POLLIN when nothing was written + * yet or POLLOUT when nothing was read yet. This may lead to a synchronization failure + * of the app: e.g., Rust Tokio's Metal I/O (mio) lib would incorrectly wake up a thread. + * To prevent this, eventfd implements a callback `post_poll()` where it verifies that some + * data was indeed written/read (i.e., that the notification is not spurious). + * b. Malicious host may inject the notification too late or not send a notification at all. + * This is a Denial of Service (DoS), which we don't care about. + * c. Malicious host may inject POLLERR, POLLHUP, POLLRDHUP, POLLNVAL. This is impossible as we + * control eventfd objects inside the LibOS, and we never raise such conditions. So the + * callback `post_poll()` panics if it detects such a return event. */ +#include "libos_checkpoint.h" #include "libos_fs.h" #include "libos_handle.h" #include "libos_internal.h" #include "libos_table.h" #include "libos_utils.h" -#include "linux_eventfd.h" #include "linux_abi/fs.h" +#include "linux_eventfd.h" #include "pal.h" #include "toml_utils.h" -bool g_eventfd_passthrough_mode = false; +bool g_eventfd_passthrough_mode __attribute_migratable = false; int init_eventfd_mode(void) { assert(g_manifest_root); @@ -38,14 +72,6 @@ static int create_eventfd_pal_handle(uint64_t initial_count, int flags, PAL_HANDLE* out_pal_handle) { int ret; - if (!g_eventfd_passthrough_mode) { - if (FIRST_TIME()) { - log_warning("The app tried to use eventfd, but it's turned off " - "(sys.insecure__allow_eventfd = false)"); - } - return -ENOSYS; - } - PAL_HANDLE hdl = NULL; int pal_flags = 0; @@ -84,12 +110,20 @@ long libos_syscall_eventfd2(unsigned int count, int flags) { hdl->type = TYPE_EVENTFD; hdl->fs = &eventfd_builtin_fs; - hdl->flags = O_RDWR; + hdl->flags = O_RDWR | (flags & EFD_NONBLOCK ? O_NONBLOCK : 0); hdl->acc_mode = MAY_READ | MAY_WRITE; hdl->info.eventfd.is_semaphore = !!(flags & EFD_SEMAPHORE); - - ret = create_eventfd_pal_handle(count, flags, &hdl->pal_handle); + hdl->info.eventfd.val = count; + hdl->info.eventfd.dummy_host_val = 0; + hdl->info.eventfd.broken_in_child = false; + + if (g_eventfd_passthrough_mode) { + ret = create_eventfd_pal_handle(hdl->info.eventfd.val, flags, &hdl->pal_handle); + } else { + ret = create_eventfd_pal_handle(hdl->info.eventfd.dummy_host_val, /*flags=*/0, + &hdl->pal_handle); + } if (ret < 0) goto out; diff --git a/libos/src/sys/libos_poll.c b/libos/src/sys/libos_poll.c index b1e59ccc96..842151c1be 100644 --- a/libos/src/sys/libos_poll.c +++ b/libos/src/sys/libos_poll.c @@ -203,6 +203,11 @@ static long do_poll(struct pollfd* fds, size_t fds_len, uint64_t* timeout_us) { continue; } + if (libos_handles[i]->fs && libos_handles[i]->fs->fs_ops + && libos_handles[i]->fs->fs_ops->post_poll) { + libos_handles[i]->fs->fs_ops->post_poll(libos_handles[i], &ret_events[i]); + } + fds[i].revents = 0; if (ret_events[i] & PAL_WAIT_ERROR) fds[i].revents |= POLLERR; @@ -216,6 +221,7 @@ static long do_poll(struct pollfd* fds, size_t fds_len, uint64_t* timeout_us) { if (ret_events[i] & PAL_WAIT_WRITE) fds[i].revents |= fds[i].events & (POLLOUT | POLLWRNORM); + /* TODO: move this to socket FS's post_poll() callback */ if (libos_handles[i]->type == TYPE_SOCK && (ret_events[i] & (PAL_WAIT_READ | PAL_WAIT_WRITE))) { bool error_event = !!(ret_events[i] & (PAL_WAIT_ERROR | PAL_WAIT_HANG_UP)); diff --git a/libos/test/ltp/ltp.cfg b/libos/test/ltp/ltp.cfg index e40899a1b9..07ba977792 100644 --- a/libos/test/ltp/ltp.cfg +++ b/libos/test/ltp/ltp.cfg @@ -264,10 +264,6 @@ skip = yes [eventfd01] skip = yes -# eventfd not allowed in manifest -[eventfd2_*] -skip = yes - # error while loading libc.so.6 [execle01] skip = yes diff --git a/libos/test/regression/eventfd_fork_allowed_failing.manifest.template b/libos/test/regression/eventfd_fork_allowed_failing.manifest.template new file mode 100644 index 0000000000..91fe5638c4 --- /dev/null +++ b/libos/test/regression/eventfd_fork_allowed_failing.manifest.template @@ -0,0 +1,26 @@ +{% set entrypoint = "eventfd_fork" -%} + +loader.entrypoint = "file:{{ gramine.libos }}" +libos.entrypoint = "{{ entrypoint }}" + +loader.log_level = "warning" # to print the warning about eventfd usage in child process + +loader.env.LD_LIBRARY_PATH = "/lib" + +fs.mounts = [ + { path = "/lib", uri = "file:{{ gramine.runtimedir(libc) }}" }, + { path = "/{{ entrypoint }}", uri = "file:{{ binary_dir }}/{{ entrypoint }}" }, +] + +# below is commented out to test the negative case: eventfd accessed in child process must fail +# because eventfd is in a secure single-process mode (due to commented-out line) +#sys.insecure__allow_eventfd = true + +sgx.debug = true +sgx.edmm_enable = {{ 'true' if env.get('EDMM', '0') == '1' else 'false' }} + +sgx.trusted_files = [ + "file:{{ gramine.libos }}", + "file:{{ gramine.runtimedir(libc) }}/", + "file:{{ binary_dir }}/{{ entrypoint }}", +] diff --git a/libos/test/regression/eventfd_races.c b/libos/test/regression/eventfd_races.c new file mode 100644 index 0000000000..78c3f5f62e --- /dev/null +++ b/libos/test/regression/eventfd_races.c @@ -0,0 +1,222 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "common.h" + +#define TEST_RUNS 10000 +#define TEST_VAL 42 + +static int g_efd; +static uint64_t g_read_events; /* atomic counter */ +static uint64_t g_write_events; /* atomic counter */ +static uint64_t g_total_events; /* atomic counter */ + +/* required to NOT increase the above counters on the "last read/write event to unblock the eventfd + * consumers"; this is purely to avoid these counters incrementing past TEST_RUNS */ +static bool g_stop_test; /* atomic boolean */ + +static void pthread_check(int x) { + if (x) { + printf("pthread failed with %d (%s)\n", x, strerror(x)); + exit(1); + } +} + +static void* write_eventfd_thread(void* arg) { + uint64_t val = TEST_VAL; + for (int i = 0; i < TEST_RUNS; i++) { + uint64_t curr_read_events = __atomic_load_n(&g_read_events, __ATOMIC_SEQ_CST); + if (write(g_efd, &val, sizeof(val)) != sizeof(val)) + errx(1, "eventfd write failed"); + while (__atomic_load_n(&g_read_events, __ATOMIC_SEQ_CST) == curr_read_events) + /* wait until some reader thread updates the read_events counter */; + } + /* send one last event to unblock the second reader */ + __atomic_store_n(&g_stop_test, true, __ATOMIC_SEQ_CST); + if (write(g_efd, &val, sizeof(val)) != sizeof(val)) + errx(1, "eventfd write failed"); + return NULL; +} + +static void* read_eventfd_thread(void* arg) { + uint64_t val; + uint64_t read_events_total = 0; + while (true) { + uint64_t curr_read_events = __atomic_load_n(&g_read_events, __ATOMIC_SEQ_CST); + if (curr_read_events == TEST_RUNS) + break; + if (read(g_efd, &val, sizeof(val)) != sizeof(val) || val != TEST_VAL) + errx(1, "eventfd read failed"); + if (__atomic_load_n(&g_stop_test, __ATOMIC_SEQ_CST)) + break; + __atomic_add_fetch(&g_read_events, 1, __ATOMIC_SEQ_CST); + read_events_total++; + } + printf("Read-only thread: read eventfd %lu times\n", read_events_total); + __atomic_add_fetch(&g_total_events, read_events_total, __ATOMIC_SEQ_CST); + return NULL; +} + +static void* poll_then_read_eventfd_thread(void* arg) { + struct pollfd pollfd = { .fd = g_efd, .events = POLLIN }; + + uint64_t val; + uint64_t poll_events_total = 0; + uint64_t read_events_total = 0; + while (true) { + uint64_t curr_read_events = __atomic_load_n(&g_read_events, __ATOMIC_SEQ_CST); + if (curr_read_events == TEST_RUNS) + break; + int ret = CHECK(poll(&pollfd, /*nfds=*/1, /*timeout=*/-1)); + poll_events_total++; + if (!ret) + continue; + if (ret != 1) + errx(1, "poll on eventfd returned more than 1 event (%d)", ret); + if (!(pollfd.revents & POLLIN)) + continue; + /* below read may block because of the race: another thread can read first and reset the + * event value; we don't care as this is benign */ + if (read(g_efd, &val, sizeof(val)) != sizeof(val) || val != TEST_VAL) + errx(1, "eventfd read failed"); + if (__atomic_load_n(&g_stop_test, __ATOMIC_SEQ_CST)) + break; + __atomic_add_fetch(&g_read_events, 1, __ATOMIC_SEQ_CST); + read_events_total++; + } + printf("Poll-then-read thread: polled eventfd %lu times\n", poll_events_total); + printf("Poll-then-read thread: read eventfd %lu times\n", read_events_total); + __atomic_add_fetch(&g_total_events, read_events_total, __ATOMIC_SEQ_CST); + return NULL; +} + +static void* blocking_write_eventfd_thread(void* arg) { + uint64_t val = 1; /* must be `1` because of semaphore semantics -- reader decrements by 1 */ + uint64_t write_events_total = 0; + while (true) { + uint64_t curr_write_events = __atomic_load_n(&g_write_events, __ATOMIC_SEQ_CST); + if (curr_write_events == TEST_RUNS) + break; + if (write(g_efd, &val, sizeof(val)) != sizeof(val)) + errx(1, "eventfd write failed"); + if (__atomic_load_n(&g_stop_test, __ATOMIC_SEQ_CST)) + break; + __atomic_add_fetch(&g_write_events, 1, __ATOMIC_SEQ_CST); + write_events_total++; + } + printf("Blocking write thread: wrote eventfd %lu times\n", write_events_total); + __atomic_add_fetch(&g_total_events, write_events_total, __ATOMIC_SEQ_CST); + return NULL; +} + +static void* read_for_blocking_write_eventfd_thread(void* arg) { + uint64_t val; + for (int i = 0; i < TEST_RUNS; i++) { + uint64_t curr_write_events = __atomic_load_n(&g_write_events, __ATOMIC_SEQ_CST); + if (read(g_efd, &val, sizeof(val)) != sizeof(val) || val != 1) + errx(1, "eventfd read failed"); + while (__atomic_load_n(&g_write_events, __ATOMIC_SEQ_CST) == curr_write_events) + /* wait until some writer thread updates the write_events counter */; + } + /* get one last event to unblock the second writer */ + __atomic_store_n(&g_stop_test, true, __ATOMIC_SEQ_CST); + if (read(g_efd, &val, sizeof(val)) != sizeof(val)) + errx(1, "eventfd read failed"); + return NULL; +} + +static void eventfd_two_readers_doing_two_reads(void) { + g_read_events = g_total_events = 0; + g_stop_test = false; + g_efd = CHECK(eventfd(0, 0)); /* a blocking non-semaphore eventfd */ + + pthread_t th[3]; /* two readers (both do blocking reads) and one writer */ + pthread_check(pthread_create(&th[0], NULL, write_eventfd_thread, NULL)); + pthread_check(pthread_create(&th[1], NULL, read_eventfd_thread, NULL)); + pthread_check(pthread_create(&th[2], NULL, read_eventfd_thread, NULL)); + + for (int i = 0; i < 3; i++) { + pthread_check(pthread_join(th[i], NULL)); + } + CHECK(close(g_efd)); + if (g_total_events != TEST_RUNS) + errx(1, "total events for subtest is %lu (expected %d)", g_total_events, TEST_RUNS); +} + +static void eventfd_two_readers_doing_read_and_poll(void) { + g_read_events = g_total_events = 0; + g_stop_test = false; + g_efd = CHECK(eventfd(0, 0)); /* a blocking non-semaphore eventfd */ + + pthread_t th[3]; /* two readers (one does blocking read, one does poll) and one writer */ + pthread_check(pthread_create(&th[0], NULL, write_eventfd_thread, NULL)); + pthread_check(pthread_create(&th[1], NULL, read_eventfd_thread, NULL)); + pthread_check(pthread_create(&th[2], NULL, poll_then_read_eventfd_thread, NULL)); + + for (int i = 0; i < 3; i++) { + pthread_check(pthread_join(th[i], NULL)); + } + CHECK(close(g_efd)); + if (g_total_events != TEST_RUNS) + errx(1, "total events for subtest is %lu (expected %d)", g_total_events, TEST_RUNS); +} + +static void eventfd_two_readers_doing_two_polls(void) { + g_read_events = g_total_events = 0; + g_stop_test = false; + g_efd = CHECK(eventfd(0, 0)); /* a blocking non-semaphore eventfd */ + + pthread_t th[3]; /* two readers (both do poll) and one writer */ + pthread_check(pthread_create(&th[0], NULL, write_eventfd_thread, NULL)); + pthread_check(pthread_create(&th[1], NULL, poll_then_read_eventfd_thread, NULL)); + pthread_check(pthread_create(&th[2], NULL, poll_then_read_eventfd_thread, NULL)); + + for (int i = 0; i < 3; i++) { + pthread_check(pthread_join(th[i], NULL)); + } + CHECK(close(g_efd)); + if (g_total_events != TEST_RUNS) + errx(1, "total events for subtest is %lu (expected %d)", g_total_events, TEST_RUNS); +} + +static void eventfd_two_writers(void) { + g_write_events = g_total_events = 0; + g_stop_test = false; + /* a blocking semaphore eventfd (we don't want reads to reset value, so that writes block) */ + g_efd = CHECK(eventfd(0, EFD_SEMAPHORE)); + uint64_t val = UINT64_MAX - 1; + if (write(g_efd, &val, sizeof(val)) != sizeof(val)) + errx(1, "initial eventfd write failed"); + + pthread_t th[3]; /* two writers (both do blocking writes) and one reader */ + pthread_check(pthread_create(&th[0], NULL, blocking_write_eventfd_thread, NULL)); + pthread_check(pthread_create(&th[1], NULL, blocking_write_eventfd_thread, NULL)); + pthread_check(pthread_create(&th[2], NULL, read_for_blocking_write_eventfd_thread, NULL)); + + for (int i = 0; i < 3; i++) { + pthread_check(pthread_join(th[i], NULL)); + } + CHECK(close(g_efd)); + if (g_total_events != TEST_RUNS) + errx(1, "total events for subtest is %lu (expected %d)", g_total_events, TEST_RUNS); +} + +int main(void) { + setbuf(stdout, NULL); + puts("------------------------"); + eventfd_two_readers_doing_two_reads(); + puts("------------------------"); + eventfd_two_readers_doing_read_and_poll(); + puts("------------------------"); + eventfd_two_readers_doing_two_polls(); + puts("------------------------"); + eventfd_two_writers(); + puts("TEST OK"); + return 0; +} diff --git a/libos/test/regression/manifest.template b/libos/test/regression/manifest.template index ae5f6cda89..692bfe1a5b 100644 --- a/libos/test/regression/manifest.template +++ b/libos/test/regression/manifest.template @@ -4,9 +4,6 @@ libos.entrypoint = "{{ entrypoint }}" loader.env.LD_LIBRARY_PATH = "/lib:{{ arch_libdir }}:/usr/{{ arch_libdir }}" loader.insecure__use_cmdline_argv = true -# for eventfd test -sys.insecure__allow_eventfd = true - # for flock_lock test sys.experimental__enable_flock = true diff --git a/libos/test/regression/meson.build b/libos/test/regression/meson.build index c5434a683a..48b9f54a05 100644 --- a/libos/test/regression/meson.build +++ b/libos/test/regression/meson.build @@ -21,6 +21,7 @@ tests = { 'epoll_test': {}, 'eventfd': {}, 'eventfd_fork': {}, + 'eventfd_races': {}, 'eventfd_read_then_write': {}, 'exec': {}, 'exec_fork': {}, diff --git a/libos/test/regression/test_libos.py b/libos/test/regression/test_libos.py index 4678322a7c..73590b8581 100644 --- a/libos/test/regression/test_libos.py +++ b/libos/test/regression/test_libos.py @@ -899,6 +899,20 @@ def test_072_eventfd_read_then_write(self): stdout, _ = self.run_binary(['eventfd_read_then_write']) self.assertIn('TEST OK', stdout) + def test_073_eventfd_fork_allowed_failing(self): + try: + self.run_binary(['eventfd_fork_allowed_failing']) + self.fail('eventfd_fork_allowed_failing unexpectedly succeeded') + except subprocess.CalledProcessError as e: + stdout = e.stdout.decode() + stderr = e.stderr.decode() + self.assertIn('Child process tried to access eventfd created by parent process', stderr) + self.assertIn('eventfd write failed', stdout) + + def test_074_eventfd_races(self): + stdout, _ = self.run_binary(['eventfd_races']) + self.assertIn('TEST OK', stdout) + @unittest.skipIf(USES_MUSL, 'sched_setscheduler is not supported in musl') def test_080_sched(self): stdout, _ = self.run_binary(['sched']) diff --git a/libos/test/regression/tests.toml b/libos/test/regression/tests.toml index 3dd6707315..9b68f74ee0 100644 --- a/libos/test/regression/tests.toml +++ b/libos/test/regression/tests.toml @@ -22,6 +22,8 @@ manifests = [ "epoll_test", "eventfd", "eventfd_fork", + "eventfd_fork_allowed_failing", + "eventfd_races", "eventfd_read_then_write", "exec", "exec_fork", diff --git a/libos/test/regression/tests_musl.toml b/libos/test/regression/tests_musl.toml index 07a9e8ca3a..b553efa678 100644 --- a/libos/test/regression/tests_musl.toml +++ b/libos/test/regression/tests_musl.toml @@ -24,6 +24,8 @@ manifests = [ "epoll_test", "eventfd", "eventfd_fork", + "eventfd_fork_allowed_failing", + "eventfd_races", "eventfd_read_then_write", "exec", "exec_fork",