From 51e99f9880052c051f8774f63af5073d82503c7d Mon Sep 17 00:00:00 2001
From: Dmitrii Kuvaiskii <dmitrii.kuvaiskii@intel.com>
Date: Mon, 22 Jan 2024 01:27:15 -0800
Subject: [PATCH] [LibOS] Add secure implementation of eventfd

The previous implementation of eventfd was insecure and disabled by
default (enabled via `sys.insecure__allow_eventfd` manifest option). The
new implementation of eventfd is secure but currently restricted to a
single process: eventfd objects created in parent process become invalid
in children.

However, sometimes it is acceptable for applications to use host-based
insecure eventfd implementation. Because of these cases, we keep the old
``sys.insecure__allow_eventfd`` manifest syntax.

LibOS regression tests are added to test the secure implementation of
eventfd. Three LTP tests are enabled. Memcached and Rust examples also
use the secure eventfd now.

Signed-off-by: Dmitrii Kuvaiskii <dmitrii.kuvaiskii@intel.com>
---
 .../memcached/memcached.manifest.template     |   4 -
 CI-Examples/nginx/Makefile                    |   6 +-
 .../rust-hyper-http-server.manifest.template  |   6 -
 Documentation/devel/features.md               |  34 ++-
 Documentation/manifest-syntax.rst             |  21 +-
 libos/include/libos_fs.h                      |   4 +
 libos/include/libos_handle.h                  |   4 +
 libos/src/fs/eventfd/fs.c                     | 238 ++++++++++++++++--
 libos/src/sys/libos_epoll.c                   |   8 +-
 libos/src/sys/libos_eventfd.c                 |  68 +++--
 libos/src/sys/libos_poll.c                    |   6 +
 libos/test/ltp/ltp.cfg                        |   4 -
 ...tfd_fork_allowed_failing.manifest.template |  26 ++
 libos/test/regression/eventfd_races.c         | 222 ++++++++++++++++
 libos/test/regression/manifest.template       |   3 -
 libos/test/regression/meson.build             |   1 +
 libos/test/regression/test_libos.py           |  14 ++
 libos/test/regression/tests.toml              |   2 +
 libos/test/regression/tests_musl.toml         |   2 +
 19 files changed, 602 insertions(+), 71 deletions(-)
 create mode 100644 libos/test/regression/eventfd_fork_allowed_failing.manifest.template
 create mode 100644 libos/test/regression/eventfd_races.c

diff --git a/CI-Examples/memcached/memcached.manifest.template b/CI-Examples/memcached/memcached.manifest.template
index 8130ac1e55..41491d8b99 100644
--- a/CI-Examples/memcached/memcached.manifest.template
+++ b/CI-Examples/memcached/memcached.manifest.template
@@ -18,10 +18,6 @@ loader.env.LD_LIBRARY_PATH = "/lib:/usr/{{ arch_libdir }}"
 loader.uid = 1000
 loader.gid = 1000
 
-# Memcached requires `eventfd` for worker thread notifications, starting from v1.6.11. If you have a
-# Memcached version older than that, we recommend to remove this insecure manifest option.
-sys.insecure__allow_eventfd = true
-
 sys.enable_sigterm_injection = true
 
 fs.mounts = [
diff --git a/CI-Examples/nginx/Makefile b/CI-Examples/nginx/Makefile
index d5fc23758c..4a87a0c4a5 100644
--- a/CI-Examples/nginx/Makefile
+++ b/CI-Examples/nginx/Makefile
@@ -29,8 +29,10 @@ ifeq ($(SGX),1)
 all: nginx.manifest.sgx nginx.sig
 endif
 
-# Note that Gramine doesn't support eventfd() and PR_SET_DUMPABLE, so we manually
-# overwrite these macros in the autogenerated configuration header of Nginx.
+# Note that Gramine doesn't support PR_SET_DUMPABLE, so we manually overwrite this macro in the
+# autogenerated configuration header of Nginx. Additionally, we disable support for eventfd, so that
+# Nginx falls back to alternative Inter Process Communication (IPC) means; recall that Gramine's
+# multi-process support for eventfd is insecure, thus we prefer not to use it here.
 $(INSTALL_DIR)/sbin/nginx: $(NGINX_SRC)/configure
 	cd $(NGINX_SRC) && ./configure --prefix=$(abspath $(INSTALL_DIR)) \
 		--without-http_rewrite_module --with-http_ssl_module
diff --git a/CI-Examples/rust/rust-hyper-http-server.manifest.template b/CI-Examples/rust/rust-hyper-http-server.manifest.template
index ae9f2a4530..bb011b96e7 100644
--- a/CI-Examples/rust/rust-hyper-http-server.manifest.template
+++ b/CI-Examples/rust/rust-hyper-http-server.manifest.template
@@ -30,12 +30,6 @@ sgx.trusted_files = [
   "file:{{ arch_libdir }}/",
 ]
 
-# The Tokio runtime requires eventfd, and the Gramine implementation
-# currently relies on the host in an insecure manner. This setting isn't
-# suitable for production deployment, but works well as a stopgap during
-# development while a proper implementation in Gramine is being worked on.
-sys.insecure__allow_eventfd = true
-
 # The maximum number of threads in a single process needs to be declared in advance.
 # You need to account for:
 # - one main thread
diff --git a/Documentation/devel/features.md b/Documentation/devel/features.md
index 3afbb60267..2266d6353c 100644
--- a/Documentation/devel/features.md
+++ b/Documentation/devel/features.md
@@ -2657,21 +2657,35 @@ Gramine and is supported.
 
 ### Event notifications (eventfd)
 
-Gramine currently implements an *insecure* version of the `eventfd()` system call. It is considered
-insecure in the context of SGX backend because it relies on the host OS, which could for example
-maliciously drop an event or inject a random one. To enable this `eventfd()` implementation, the
-manifest file must contain `sys.insecure__allow_eventfd = true` ({ref}`see manifest documentation
-<allowing-eventfd>`).
+There are two modes of eventfd:
 
-Gramine supports polling on eventfd via `poll()`, `ppoll()`, `select()`, `epoll_*()` system calls.
+1. Secure "emulate-in-Gramine" -- the eventfd object is created inside Gramine, and all operations
+   are resolved entirely inside Gramine. A dummy eventfd object is created on the host, purely to
+   trigger read/write notifications (e.g., in epoll); eventfd values are verified inside Gramine and
+   are never exposed to the host. Since the host is used purely for notifications, a malicious host
+   can only induce Denial of Service (DoS) attacks; thus this implementation is secure and enabled
+   by default. This implementation is automatically disabled if `sys.insecure__allow_eventfd`
+   {ref}`manifest option <allowing-eventfd>` is enabled.
 
-Gramine may implement a secure version of `eventfd()` for communication between Gramine processes in
-the future. Such secure version will *not* be able to receive events from the host OS.
+   The emulation is currently implemented at the level of a single process. The emulation *may* work
+   for multi-process applications, e.g., if the child process inherits the eventfd object but
+   doesn't use it. However, all eventfds created in the parent process are marked as invalid in
+   child processes, i.e. inter-process communication via eventfds is not allowed.
+
+   Note that this secure version is *not* able to receive events from the host OS.
+
+2. Insecure "passthrough-to-host" -- the eventfd object is created on the host, and all operations
+   are delegated to the host. Since this implementation is insecure, it is disallowed by default. To
+   use this implementation, it must be explicitly allowed via the `sys.insecure__allow_eventfd`
+   {ref}`manifest option <allowing-eventfd>`.
+
+Gramine supports polling on eventfd via `poll()`, `ppoll()`, `select()`, `epoll_*()` system calls,
+in both secure and insecure modes.
 
 <details><summary>Related system calls</summary>
 
-- ▣ `eventfd()`: insecure implementation
-- ▣ `eventfd2()`: insecure implementation
+- ☑ `eventfd()`: see notes above
+- ☑ `eventfd2()`: see notes above
 - ☑ `close()`
 
 - ☑ `read()`
diff --git a/Documentation/manifest-syntax.rst b/Documentation/manifest-syntax.rst
index 84bc043712..9e702f84b7 100644
--- a/Documentation/manifest-syntax.rst
+++ b/Documentation/manifest-syntax.rst
@@ -327,17 +327,28 @@ a 1 |~| MiB brk size.
 
 .. _allowing-eventfd:
 
-Allowing eventfd
-^^^^^^^^^^^^^^^^
+Allowing host-based insecure eventfd
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 ::
 
     sys.insecure__allow_eventfd = [true|false]
     (Default: false)
 
-This specifies whether to allow system calls `eventfd()` and `eventfd2()`. Since
-eventfd emulation currently relies on the host, these system calls are
-disallowed by default due to security concerns.
+By default, Gramine implements eventfd in a secure but restricted way: currently
+this secure implementation only works when eventfd usage is confined to a single
+process (note that application may still be multi-process and spawn child
+processes, but eventfds created in parent will be invalid in children).
+
+However, sometimes it is acceptable for applications to use host-based insecure
+eventfd implementation. This implementation works without the above-mentioned
+restriction in multi-process applications. Use ``sys.insecure__allow_eventfd``
+manifest syntax to switch to this insecure implementation.
+
+.. note ::
+   ``sys.insecure__allow_eventfd`` is pass-through and thus potentially insecure
+   in e.g. SGX environments. It is the responsibility of the app developer to
+   analyze the app usage of eventfd, with security implications in mind.
 
 .. _external-sigterm-injection:
 
diff --git a/libos/include/libos_fs.h b/libos/include/libos_fs.h
index cbee338138..116d17c628 100644
--- a/libos/include/libos_fs.h
+++ b/libos/include/libos_fs.h
@@ -182,6 +182,10 @@ struct libos_fs_ops {
     /* Poll a single handle. Must not block. */
     int (*poll)(struct libos_handle* hdl, int in_events, int* out_events);
 
+    /* Verify a single handle after poll. Must update `pal_ret_events` in-place with only allowed
+     * ones. Used in e.g. secure eventfd FS to verify if the host is not lying to us. */
+    void (*post_poll)(struct libos_handle* hdl, pal_wait_flags_t* pal_ret_events);
+
     /* checkpoint/migrate the file system */
     ssize_t (*checkpoint)(void** checkpoint, void* mount_data);
     int (*migrate)(void* checkpoint, void** mount_data);
diff --git a/libos/include/libos_handle.h b/libos/include/libos_handle.h
index 81c28c91d8..5bafa23680 100644
--- a/libos/include/libos_handle.h
+++ b/libos/include/libos_handle.h
@@ -135,7 +135,11 @@ struct libos_epoll_handle {
 };
 
 struct libos_eventfd_handle {
+    bool broken_in_child;
     bool is_semaphore;
+    spinlock_t lock; /* protects below fields */
+    uint64_t val;
+    uint64_t dummy_host_val;
 };
 
 struct libos_handle {
diff --git a/libos/src/fs/eventfd/fs.c b/libos/src/fs/eventfd/fs.c
index abb01b74d7..5f32aa5819 100644
--- a/libos/src/fs/eventfd/fs.c
+++ b/libos/src/fs/eventfd/fs.c
@@ -1,8 +1,9 @@
 /* SPDX-License-Identifier: LGPL-3.0-or-later */
-/* Copyright (C) 2019 Intel Corporation */
+/* Copyright (C) 2024 Intel Corporation */
 
 /*
- * This file contains code for passthrough-to-host implementation of 'eventfd' filesystem.
+ * This file contains code for passthrough-to-host and emulate-in-libos implementations of 'eventfd'
+ * filesystem. For more information on the modes, see `libos_eventfd.c`.
  */
 
 #include "libos_fs.h"
@@ -12,44 +13,243 @@
 #include "linux_abi/errors.h"
 #include "pal.h"
 
+/* In emulate-in-libos mode, enforce a restriction: all eventfds created in the parent process are
+ * marked as invalid in child processes, i.e. inter-process communication via eventfds is not
+ * allowed. This restriction is because LibOS doesn't yet implement sync between eventfd objects. */
+static int eventfd_checkin(struct libos_handle* hdl) {
+    assert(hdl->type == TYPE_EVENTFD);
+    if (!g_eventfd_passthrough_mode)
+        hdl->info.eventfd.broken_in_child = true;
+    return 0;
+}
+
+static void eventfd_dummy_host_read(struct libos_handle* hdl) {
+    int ret;
+    uint64_t buf_dummy_host_val = 0;
+    size_t dummy_host_val_count = sizeof(buf_dummy_host_val);
+    do {
+        ret = PalStreamRead(hdl->pal_handle, /*offset=*/0, &dummy_host_val_count,
+                            &buf_dummy_host_val);
+    } while (ret == -PAL_ERROR_INTERRUPTED);
+    if (ret < 0 || dummy_host_val_count != sizeof(buf_dummy_host_val)) {
+        /* must not happen in benign case, consider it an attack and panic */
+        BUG();
+    }
+}
+
+static void eventfd_dummy_host_write(struct libos_handle* hdl) {
+    int ret;
+    uint64_t buf_dummy_host_val = 1;
+    size_t dummy_host_val_count = sizeof(buf_dummy_host_val);
+    do {
+        ret = PalStreamWrite(hdl->pal_handle, /*offset=*/0, &dummy_host_val_count,
+                             &buf_dummy_host_val);
+    } while (ret == -PAL_ERROR_INTERRUPTED);
+    if (ret < 0 || dummy_host_val_count != sizeof(buf_dummy_host_val)) {
+        /* must not happen in benign case, consider it an attack and panic */
+        BUG();
+    }
+}
+
+static void eventfd_dummy_host_wait(struct libos_handle* hdl) {
+    pal_wait_flags_t wait_for_events = PAL_WAIT_READ;
+    pal_wait_flags_t ret_events = 0;
+    int ret = PalStreamsWaitEvents(1, &hdl->pal_handle, &wait_for_events, &ret_events, NULL);
+    if (ret < 0 && ret != -PAL_ERROR_INTERRUPTED) {
+        BUG();
+    }
+    (void)ret_events; /* we don't care what events the host returned, we can't trust them anyway */
+}
+
 static ssize_t eventfd_read(struct libos_handle* hdl, void* buf, size_t count, file_off_t* pos) {
     __UNUSED(pos);
 
     if (count < sizeof(uint64_t))
         return -EINVAL;
 
-    int ret = PalStreamRead(hdl->pal_handle, /*offset=*/0, &count, buf);
-    if (!ret && count != sizeof(uint64_t)) {
-        /* successful read must return 8 bytes, otherwise it's an attack or host malfunction */
-        return -EPERM;
+    if (g_eventfd_passthrough_mode) {
+        /* passthrough-to-host mode */
+        int ret = PalStreamRead(hdl->pal_handle, /*offset=*/0, &count, buf);
+        if (!ret && count != sizeof(uint64_t)) {
+            /* successful read must return 8 bytes, otherwise it's an attack or host malfunction */
+            return -EPERM;
+        }
+        ret = pal_to_unix_errno(ret);
+        /* eventfd objects never perform partial reads, see also check above */
+        maybe_epoll_et_trigger(hdl, ret, /*in=*/true, /*unused was_partial=*/false);
+        return ret < 0 ? ret : (ssize_t)count;
     }
-    ret = pal_to_unix_errno(ret);
-    /* eventfd objects never perform partial reads, see also check above */
+
+    /* emulate-in-libos mode */
+    if (hdl->info.eventfd.broken_in_child) {
+        log_warning("Child process tried to access eventfd created by parent process. This is "
+                    "disallowed in Gramine (but see `sys.insecure__allow_eventfd`).");
+        return -EIO;
+    }
+
+    int ret;
+    spinlock_lock(&hdl->info.eventfd.lock);
+
+    while (!hdl->info.eventfd.val) {
+        if (hdl->flags & O_NONBLOCK) {
+            ret = -EAGAIN;
+            goto out;
+        }
+        spinlock_unlock(&hdl->info.eventfd.lock);
+        eventfd_dummy_host_wait(hdl);
+        spinlock_lock(&hdl->info.eventfd.lock);
+    }
+
+    if (!hdl->info.eventfd.is_semaphore) {
+        memcpy(buf, &hdl->info.eventfd.val, sizeof(uint64_t));
+        hdl->info.eventfd.val = 0;
+    } else {
+        uint64_t one_val = 1;
+        memcpy(buf, &one_val, sizeof(uint64_t));
+        hdl->info.eventfd.val--;
+    }
+
+    /* perform a read (not supposed to block) to clear the event from polling threads and to send an
+     * event to writing threads */
+    if (hdl->info.eventfd.dummy_host_val) {
+        eventfd_dummy_host_read(hdl);
+        hdl->info.eventfd.dummy_host_val = 0;
+    }
+
+    ret = (ssize_t)count;
+out:
+    spinlock_unlock(&hdl->info.eventfd.lock);
     maybe_epoll_et_trigger(hdl, ret, /*in=*/true, /*unused was_partial=*/false);
-    return ret < 0 ? ret : (ssize_t)count;
+    return ret;
 }
 
 static ssize_t eventfd_write(struct libos_handle* hdl, const void* buf, size_t count,
                              file_off_t* pos) {
     __UNUSED(pos);
 
-    if (count < sizeof(uint64_t))
+    /* note: before v6.9, Linux kernel had a `<` comparison; here we adhere to the latest Linux
+     * version (v6.9) that switched to `!=` */
+    if (count != sizeof(uint64_t))
         return -EINVAL;
 
-    int ret = PalStreamWrite(hdl->pal_handle, /*offset=*/0, &count, (void*)buf);
-    if (!ret && count != sizeof(uint64_t)) {
-        /* successful write must return 8 bytes, otherwise it's an attack or host malfunction */
-        return -EPERM;
+    uint64_t buf_val;
+    memcpy(&buf_val, buf, sizeof(uint64_t));
+    if (buf_val == UINT64_MAX)
+        return -EINVAL;
+
+    if (buf_val == 0) {
+        /* Note: we explicitly emulate eventfd write-of-zero as a no-op. Linux kernel ultimately
+         * also no-ops on write-of-zero despite the lack of an explicit early-return condition
+         * (https://elixir.bootlin.com/linux/v6.8/source/fs/eventfd.c#L247). This explicit check is
+         * required in Gramine due to how our polling works. */
+        return (ssize_t)count;
+    }
+
+    if (g_eventfd_passthrough_mode) {
+        /* passthrough-to-host mode */
+        int ret = PalStreamWrite(hdl->pal_handle, /*offset=*/0, &count, (void*)buf);
+        if (!ret && count != sizeof(uint64_t)) {
+            /* successful write must return 8 bytes, otherwise it's an attack or host malfunction */
+            return -EPERM;
+        }
+        ret = pal_to_unix_errno(ret);
+        /* eventfd objects never perform partial writes, see also check above */
+        maybe_epoll_et_trigger(hdl, ret, /*in=*/false, /*unused was_partial=*/false);
+        return ret < 0 ? ret : (ssize_t)count;
+    }
+
+    /* emulate-in-libos mode */
+    if (hdl->info.eventfd.broken_in_child) {
+        log_warning("Child process tried to access eventfd created by parent process. This is "
+                    "disallowed in Gramine (but see `sys.insecure__allow_eventfd`).");
+        return -EIO;
     }
-    ret = pal_to_unix_errno(ret);
-    /* eventfd objects never perform partial writes, see also check above */
+
+    int ret;
+    spinlock_lock(&hdl->info.eventfd.lock);
+
+    uint64_t val;
+    while (__builtin_add_overflow(hdl->info.eventfd.val, buf_val, &val) || val > UINT64_MAX - 1) {
+        if (hdl->flags & O_NONBLOCK) {
+            ret = -EAGAIN;
+            goto out;
+        }
+        spinlock_unlock(&hdl->info.eventfd.lock);
+        /*
+         * For the corner case of write overflowing the counter (and thus blocking), we choose to
+         * emulate blocking via busy loop. This may burn cycles and starve other threads.
+         * Additionally, host polls on eventfd write events will *always* report that eventfd is
+         * available for writing, even if it is actually supposed to be blocked; our emulation
+         * removes the "available for writing" return event (see `eventfd_post_poll()`) so the user
+         * app sees that the poll returned but doesn't see the write event (i.e. spurious return).
+         * Overflowing eventfd writes are considered sufficiently rare, so our busy-loop emulation
+         * should not hamper performance of normal apps.
+         *
+         * An alternative would be to (1) use a separate synchronization mechanism on which the
+         * writer would block, or to (2) keep the host value in sync with the LibOS value. The
+         * former alternative would introduce complexity, whereas the latter alternative would
+         * violate confidentiality (by leaking eventfd counter).
+         */
+        PalThreadYieldExecution();
+        spinlock_lock(&hdl->info.eventfd.lock);
+    }
+
+    hdl->info.eventfd.val = val;
+
+    /* perform a write (not supposed to block) to send an event to reading/polling threads */
+    if (hdl->info.eventfd.dummy_host_val >= UINT64_MAX - 1)
+        BUG();
+    hdl->info.eventfd.dummy_host_val++;
+    eventfd_dummy_host_write(hdl);
+
+    ret = (ssize_t)count;
+out:
+    spinlock_unlock(&hdl->info.eventfd.lock);
     maybe_epoll_et_trigger(hdl, ret, /*in=*/false, /*unused was_partial=*/false);
-    return ret < 0 ? ret : (ssize_t)count;
+    return ret;
+}
+
+static void eventfd_post_poll(struct libos_handle* hdl, pal_wait_flags_t* pal_ret_events) {
+    if (g_eventfd_passthrough_mode)
+        return;
+
+    if (hdl->info.eventfd.broken_in_child) {
+        log_warning("Child process tried to access eventfd created by parent process. This is "
+                    "disallowed in Gramine (but see `sys.insecure__allow_eventfd`).");
+        *pal_ret_events = PAL_WAIT_ERROR;
+        return;
+    }
+
+    if (*pal_ret_events & (PAL_WAIT_ERROR | PAL_WAIT_HANG_UP)) {
+        /* impossible: we control eventfd inside the LibOS, and we never raise such conditions */
+        BUG();
+    }
+
+    spinlock_lock(&hdl->info.eventfd.lock);
+    if (*pal_ret_events & PAL_WAIT_READ) {
+        /* there is data to read: verify if counter has value greater than zero */
+        if (!hdl->info.eventfd.val) {
+            /* spurious or malicious notification, can legitimately happen if another thread
+             * consumed this event between this thread's poll wakeup and the post_poll callback;
+             * we currently choose to return a spurious notification to the user */
+            *pal_ret_events &= ~PAL_WAIT_READ;
+        }
+    }
+    if (*pal_ret_events & PAL_WAIT_WRITE) {
+        /* verify it's really possible to write a value of at least "1" without blocking */
+        if (hdl->info.eventfd.val >= UINT64_MAX - 1) {
+            /* spurious or malicious notification, see comment above */
+            *pal_ret_events &= ~PAL_WAIT_WRITE;
+        }
+    }
+    spinlock_unlock(&hdl->info.eventfd.lock);
 }
 
 struct libos_fs_ops eventfd_fs_ops = {
-    .read  = &eventfd_read,
-    .write = &eventfd_write,
+    .checkin   = &eventfd_checkin,
+    .read      = &eventfd_read,
+    .write     = &eventfd_write,
+    .post_poll = &eventfd_post_poll,
 };
 
 struct libos_fs eventfd_builtin_fs = {
diff --git a/libos/src/sys/libos_epoll.c b/libos/src/sys/libos_epoll.c
index fab6965eee..06c78f5559 100644
--- a/libos/src/sys/libos_epoll.c
+++ b/libos/src/sys/libos_epoll.c
@@ -181,9 +181,9 @@ void maybe_epoll_et_trigger(struct libos_handle* handle, int ret, bool in, bool
                  * Some workloads (e.g. rust's tokio crate) use eventfd with EPOLLET in a peculiar
                  * way: each write to that eventfd increases counter by 1 and thanks to EPOLLET is
                  * reported by epoll only once, even if there is no read from eventfd.
+                 *
                  * To handle such usage pattern, we mark eventfd as read-epollet-pollable on each
                  * write - we assume that eventfd is not shared between processes.
-                 * Hopefully no app tries to increase the eventfd counter by 0...
                  */
                 __atomic_store_n(&handle->needs_et_poll_in, true, __ATOMIC_RELEASE);
                 needs_et = true;
@@ -658,6 +658,11 @@ static int do_epoll_wait(int epfd, struct epoll_event* events, int maxevents, in
                 continue;
             }
 
+            if (items[i]->handle->fs && items[i]->handle->fs->fs_ops
+                    && items[i]->handle->fs->fs_ops->post_poll) {
+                items[i]->handle->fs->fs_ops->post_poll(items[i]->handle, &pal_ret_events[i]);
+            }
+
             uint32_t this_item_events = 0;
             if (pal_ret_events[i] & PAL_WAIT_ERROR) {
                 this_item_events |= EPOLLERR;
@@ -674,6 +679,7 @@ static int do_epoll_wait(int epfd, struct epoll_event* events, int maxevents, in
                 this_item_events |= items[i]->events & (EPOLLOUT | EPOLLWRNORM);
             }
 
+            /* TODO: move this to socket FS's post_poll() callback */
             if (items[i]->handle->type == TYPE_SOCK &&
                     (pal_ret_events[i] & (PAL_WAIT_READ | PAL_WAIT_WRITE))) {
                 bool error_event = !!(pal_ret_events[i] & (PAL_WAIT_ERROR | PAL_WAIT_HANG_UP));
diff --git a/libos/src/sys/libos_eventfd.c b/libos/src/sys/libos_eventfd.c
index 8c9ca027b8..f4435460c8 100644
--- a/libos/src/sys/libos_eventfd.c
+++ b/libos/src/sys/libos_eventfd.c
@@ -1,23 +1,57 @@
 /* SPDX-License-Identifier: LGPL-3.0-or-later */
-/* Copyright (C) 2019 Intel Corporation */
+/* Copyright (C) 2024 Intel Corporation */
 
 /*
- * Implementation of system calls "eventfd" and "eventfd2". Since eventfd emulation currently relies
- * on the host, these system calls are disallowed by default due to security concerns. To use them,
- * they must be explicitly allowed through the "sys.insecure__allow_eventfd" manifest key.
+ * Implementation of system calls "eventfd" and "eventfd2".
+ *
+ * There are two modes of eventfd:
+ *
+ * 1. Passthrough-to-host -- the eventfd object is created on the host, and all operations are
+ *    delegated to the host. Since this implementation is insecure, it is disallowed by default. To
+ *    use this implementation, it must be explicitly allowed via the `sys.insecure__allow_eventfd`
+ *    manifest option.
+ *
+ * 2. Emulate-in-libos -- the eventfd object is created inside the LibOS, and all operations are
+ *    resolved entirely inside the LibOS. A dummy eventfd object is created on the host, purely to
+ *    trigger read/write notifications (e.g., in epoll); eventfd values are verified inside the
+ *    LibOS and are never exposed to the host. Since the host is used purely for notifications (see
+ *    notes below), this implementation is considered secure and enabled by default. It is
+ *    automatically disabled if the manifest option `sys.insecure__allow_eventfd` is enabled.
+ *
+ *    - The emulation is currently implemented at the level of a single process. The emulation *may*
+ *      work for multi-process applications, e.g., if the child process inherits the eventfd object
+ *      but doesn't use it. However, all eventfds created in the parent process are marked as
+ *      invalid in child processes, i.e. inter-process communication via eventfds is not allowed.
+ *
+ *    - The host's eventfd object is "dummy" and used purely for notifications -- to unblock
+ *      blocking read/write/select/poll/epoll system calls. The read/write notify logic is already
+ *      hardened, by double-checking that the object was indeed updated. However, there are three
+ *      possible attacks on polling mechanisms (select/poll/epoll):
+ *
+ *      a. Malicious host may inject the notification too early: POLLIN when nothing was written
+ *         yet or POLLOUT when nothing was read yet. This may lead to a synchronization failure
+ *         of the app: e.g., Rust Tokio's Metal I/O (mio) lib would incorrectly wake up a thread.
+ *         To prevent this, eventfd implements a callback `post_poll()` where it verifies that some
+ *         data was indeed written/read (i.e., that the notification is not spurious).
+ *      b. Malicious host may inject the notification too late or not send a notification at all.
+ *         This is a Denial of Service (DoS), which we don't care about.
+ *      c. Malicious host may inject POLLERR, POLLHUP, POLLRDHUP, POLLNVAL. This is impossible as we
+ *         control eventfd objects inside the LibOS, and we never raise such conditions. So the
+ *         callback `post_poll()` panics if it detects such a return event.
  */
 
+#include "libos_checkpoint.h"
 #include "libos_fs.h"
 #include "libos_handle.h"
 #include "libos_internal.h"
 #include "libos_table.h"
 #include "libos_utils.h"
-#include "linux_eventfd.h"
 #include "linux_abi/fs.h"
+#include "linux_eventfd.h"
 #include "pal.h"
 #include "toml_utils.h"
 
-bool g_eventfd_passthrough_mode = false;
+bool g_eventfd_passthrough_mode __attribute_migratable = false;
 
 int init_eventfd_mode(void) {
     assert(g_manifest_root);
@@ -38,14 +72,6 @@ static int create_eventfd_pal_handle(uint64_t initial_count, int flags,
                                      PAL_HANDLE* out_pal_handle) {
     int ret;
 
-    if (!g_eventfd_passthrough_mode) {
-        if (FIRST_TIME()) {
-            log_warning("The app tried to use eventfd, but it's turned off "
-                        "(sys.insecure__allow_eventfd = false)");
-        }
-        return -ENOSYS;
-    }
-
     PAL_HANDLE hdl = NULL;
 
     int pal_flags = 0;
@@ -84,12 +110,20 @@ long libos_syscall_eventfd2(unsigned int count, int flags) {
 
     hdl->type = TYPE_EVENTFD;
     hdl->fs = &eventfd_builtin_fs;
-    hdl->flags = O_RDWR;
+    hdl->flags = O_RDWR | (flags & EFD_NONBLOCK ? O_NONBLOCK : 0);
     hdl->acc_mode = MAY_READ | MAY_WRITE;
 
     hdl->info.eventfd.is_semaphore = !!(flags & EFD_SEMAPHORE);
-
-    ret = create_eventfd_pal_handle(count, flags, &hdl->pal_handle);
+    hdl->info.eventfd.val = count;
+    hdl->info.eventfd.dummy_host_val = 0;
+    hdl->info.eventfd.broken_in_child = false;
+
+    if (g_eventfd_passthrough_mode) {
+        ret = create_eventfd_pal_handle(hdl->info.eventfd.val, flags, &hdl->pal_handle);
+    } else {
+        ret = create_eventfd_pal_handle(hdl->info.eventfd.dummy_host_val, /*flags=*/0,
+                                        &hdl->pal_handle);
+    }
     if (ret < 0)
         goto out;
 
diff --git a/libos/src/sys/libos_poll.c b/libos/src/sys/libos_poll.c
index b1e59ccc96..842151c1be 100644
--- a/libos/src/sys/libos_poll.c
+++ b/libos/src/sys/libos_poll.c
@@ -203,6 +203,11 @@ static long do_poll(struct pollfd* fds, size_t fds_len, uint64_t* timeout_us) {
             continue;
         }
 
+        if (libos_handles[i]->fs && libos_handles[i]->fs->fs_ops
+                && libos_handles[i]->fs->fs_ops->post_poll) {
+            libos_handles[i]->fs->fs_ops->post_poll(libos_handles[i], &ret_events[i]);
+        }
+
         fds[i].revents = 0;
         if (ret_events[i] & PAL_WAIT_ERROR)
             fds[i].revents |= POLLERR;
@@ -216,6 +221,7 @@ static long do_poll(struct pollfd* fds, size_t fds_len, uint64_t* timeout_us) {
         if (ret_events[i] & PAL_WAIT_WRITE)
             fds[i].revents |= fds[i].events & (POLLOUT | POLLWRNORM);
 
+        /* TODO: move this to socket FS's post_poll() callback */
         if (libos_handles[i]->type == TYPE_SOCK &&
                 (ret_events[i] & (PAL_WAIT_READ | PAL_WAIT_WRITE))) {
             bool error_event = !!(ret_events[i] & (PAL_WAIT_ERROR | PAL_WAIT_HANG_UP));
diff --git a/libos/test/ltp/ltp.cfg b/libos/test/ltp/ltp.cfg
index e40899a1b9..07ba977792 100644
--- a/libos/test/ltp/ltp.cfg
+++ b/libos/test/ltp/ltp.cfg
@@ -264,10 +264,6 @@ skip = yes
 [eventfd01]
 skip = yes
 
-# eventfd not allowed in manifest
-[eventfd2_*]
-skip = yes
-
 # error while loading libc.so.6
 [execle01]
 skip = yes
diff --git a/libos/test/regression/eventfd_fork_allowed_failing.manifest.template b/libos/test/regression/eventfd_fork_allowed_failing.manifest.template
new file mode 100644
index 0000000000..91fe5638c4
--- /dev/null
+++ b/libos/test/regression/eventfd_fork_allowed_failing.manifest.template
@@ -0,0 +1,26 @@
+{% set entrypoint = "eventfd_fork" -%}
+
+loader.entrypoint = "file:{{ gramine.libos }}"
+libos.entrypoint = "{{ entrypoint }}"
+
+loader.log_level = "warning"  # to print the warning about eventfd usage in child process
+
+loader.env.LD_LIBRARY_PATH = "/lib"
+
+fs.mounts = [
+  { path = "/lib", uri = "file:{{ gramine.runtimedir(libc) }}" },
+  { path = "/{{ entrypoint }}", uri = "file:{{ binary_dir }}/{{ entrypoint }}" },
+]
+
+# below is commented out to test the negative case: eventfd accessed in child process must fail
+# because eventfd is in a secure single-process mode (due to commented-out line)
+#sys.insecure__allow_eventfd = true
+
+sgx.debug = true
+sgx.edmm_enable = {{ 'true' if env.get('EDMM', '0') == '1' else 'false' }}
+
+sgx.trusted_files = [
+  "file:{{ gramine.libos }}",
+  "file:{{ gramine.runtimedir(libc) }}/",
+  "file:{{ binary_dir }}/{{ entrypoint }}",
+]
diff --git a/libos/test/regression/eventfd_races.c b/libos/test/regression/eventfd_races.c
new file mode 100644
index 0000000000..78c3f5f62e
--- /dev/null
+++ b/libos/test/regression/eventfd_races.c
@@ -0,0 +1,222 @@
+#include <poll.h>
+#include <pthread.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/eventfd.h>
+#include <unistd.h>
+
+#include "common.h"
+
+#define TEST_RUNS 10000
+#define TEST_VAL  42
+
+static int g_efd;
+static uint64_t g_read_events;  /* atomic counter */
+static uint64_t g_write_events; /* atomic counter */
+static uint64_t g_total_events; /* atomic counter */
+
+/* required to NOT increase the above counters on the "last read/write event to unblock the eventfd
+ * consumers"; this is purely to avoid these counters incrementing past TEST_RUNS */
+static bool g_stop_test;        /* atomic boolean */
+
+static void pthread_check(int x) {
+    if (x) {
+        printf("pthread failed with %d (%s)\n", x, strerror(x));
+        exit(1);
+    }
+}
+
+static void* write_eventfd_thread(void* arg) {
+    uint64_t val = TEST_VAL;
+    for (int i = 0; i < TEST_RUNS; i++) {
+        uint64_t curr_read_events = __atomic_load_n(&g_read_events, __ATOMIC_SEQ_CST);
+        if (write(g_efd, &val, sizeof(val)) != sizeof(val))
+            errx(1, "eventfd write failed");
+        while (__atomic_load_n(&g_read_events, __ATOMIC_SEQ_CST) == curr_read_events)
+            /* wait until some reader thread updates the read_events counter */;
+    }
+    /* send one last event to unblock the second reader */
+    __atomic_store_n(&g_stop_test, true, __ATOMIC_SEQ_CST);
+    if (write(g_efd, &val, sizeof(val)) != sizeof(val))
+        errx(1, "eventfd write failed");
+    return NULL;
+}
+
+static void* read_eventfd_thread(void* arg) {
+    uint64_t val;
+    uint64_t read_events_total = 0;
+    while (true) {
+        uint64_t curr_read_events = __atomic_load_n(&g_read_events, __ATOMIC_SEQ_CST);
+        if (curr_read_events == TEST_RUNS)
+            break;
+        if (read(g_efd, &val, sizeof(val)) != sizeof(val) || val != TEST_VAL)
+            errx(1, "eventfd read failed");
+        if (__atomic_load_n(&g_stop_test, __ATOMIC_SEQ_CST))
+            break;
+        __atomic_add_fetch(&g_read_events, 1, __ATOMIC_SEQ_CST);
+        read_events_total++;
+    }
+    printf("Read-only thread: read eventfd %lu times\n", read_events_total);
+    __atomic_add_fetch(&g_total_events, read_events_total, __ATOMIC_SEQ_CST);
+    return NULL;
+}
+
+static void* poll_then_read_eventfd_thread(void* arg) {
+    struct pollfd pollfd = { .fd = g_efd, .events = POLLIN };
+
+    uint64_t val;
+    uint64_t poll_events_total = 0;
+    uint64_t read_events_total = 0;
+    while (true) {
+        uint64_t curr_read_events = __atomic_load_n(&g_read_events, __ATOMIC_SEQ_CST);
+        if (curr_read_events == TEST_RUNS)
+            break;
+        int ret = CHECK(poll(&pollfd, /*nfds=*/1, /*timeout=*/-1));
+        poll_events_total++;
+        if (!ret)
+            continue;
+        if (ret != 1)
+            errx(1, "poll on eventfd returned more than 1 event (%d)", ret);
+        if (!(pollfd.revents & POLLIN))
+            continue;
+        /* below read may block because of the race: another thread can read first and reset the
+         * event value; we don't care as this is benign */
+        if (read(g_efd, &val, sizeof(val)) != sizeof(val) || val != TEST_VAL)
+            errx(1, "eventfd read failed");
+        if (__atomic_load_n(&g_stop_test, __ATOMIC_SEQ_CST))
+            break;
+        __atomic_add_fetch(&g_read_events, 1, __ATOMIC_SEQ_CST);
+        read_events_total++;
+    }
+    printf("Poll-then-read thread: polled eventfd %lu times\n", poll_events_total);
+    printf("Poll-then-read thread: read eventfd %lu times\n", read_events_total);
+    __atomic_add_fetch(&g_total_events, read_events_total, __ATOMIC_SEQ_CST);
+    return NULL;
+}
+
+static void* blocking_write_eventfd_thread(void* arg) {
+    uint64_t val = 1; /* must be `1` because of semaphore semantics -- reader decrements by 1 */
+    uint64_t write_events_total = 0;
+    while (true) {
+        uint64_t curr_write_events = __atomic_load_n(&g_write_events, __ATOMIC_SEQ_CST);
+        if (curr_write_events == TEST_RUNS)
+            break;
+        if (write(g_efd, &val, sizeof(val)) != sizeof(val))
+            errx(1, "eventfd write failed");
+        if (__atomic_load_n(&g_stop_test, __ATOMIC_SEQ_CST))
+            break;
+        __atomic_add_fetch(&g_write_events, 1, __ATOMIC_SEQ_CST);
+        write_events_total++;
+    }
+    printf("Blocking write thread: wrote eventfd %lu times\n", write_events_total);
+    __atomic_add_fetch(&g_total_events, write_events_total, __ATOMIC_SEQ_CST);
+    return NULL;
+}
+
+static void* read_for_blocking_write_eventfd_thread(void* arg) {
+    uint64_t val;
+    for (int i = 0; i < TEST_RUNS; i++) {
+        uint64_t curr_write_events = __atomic_load_n(&g_write_events, __ATOMIC_SEQ_CST);
+        if (read(g_efd, &val, sizeof(val)) != sizeof(val) || val != 1)
+            errx(1, "eventfd read failed");
+        while (__atomic_load_n(&g_write_events, __ATOMIC_SEQ_CST) == curr_write_events)
+            /* wait until some writer thread updates the write_events counter */;
+    }
+    /* get one last event to unblock the second writer */
+    __atomic_store_n(&g_stop_test, true, __ATOMIC_SEQ_CST);
+    if (read(g_efd, &val, sizeof(val)) != sizeof(val))
+        errx(1, "eventfd read failed");
+    return NULL;
+}
+
+static void eventfd_two_readers_doing_two_reads(void) {
+    g_read_events = g_total_events = 0;
+    g_stop_test = false;
+    g_efd = CHECK(eventfd(0, 0)); /* a blocking non-semaphore eventfd */
+
+    pthread_t th[3]; /* two readers (both do blocking reads) and one writer */
+    pthread_check(pthread_create(&th[0], NULL, write_eventfd_thread, NULL));
+    pthread_check(pthread_create(&th[1], NULL, read_eventfd_thread, NULL));
+    pthread_check(pthread_create(&th[2], NULL, read_eventfd_thread, NULL));
+
+    for (int i = 0; i < 3; i++) {
+        pthread_check(pthread_join(th[i], NULL));
+    }
+    CHECK(close(g_efd));
+    if (g_total_events != TEST_RUNS)
+        errx(1, "total events for subtest is %lu (expected %d)", g_total_events, TEST_RUNS);
+}
+
+static void eventfd_two_readers_doing_read_and_poll(void) {
+    g_read_events = g_total_events = 0;
+    g_stop_test = false;
+    g_efd = CHECK(eventfd(0, 0)); /* a blocking non-semaphore eventfd */
+
+    pthread_t th[3]; /* two readers (one does blocking read, one does poll) and one writer */
+    pthread_check(pthread_create(&th[0], NULL, write_eventfd_thread, NULL));
+    pthread_check(pthread_create(&th[1], NULL, read_eventfd_thread, NULL));
+    pthread_check(pthread_create(&th[2], NULL, poll_then_read_eventfd_thread, NULL));
+
+    for (int i = 0; i < 3; i++) {
+        pthread_check(pthread_join(th[i], NULL));
+    }
+    CHECK(close(g_efd));
+    if (g_total_events != TEST_RUNS)
+        errx(1, "total events for subtest is %lu (expected %d)", g_total_events, TEST_RUNS);
+}
+
+static void eventfd_two_readers_doing_two_polls(void) {
+    g_read_events = g_total_events = 0;
+    g_stop_test = false;
+    g_efd = CHECK(eventfd(0, 0)); /* a blocking non-semaphore eventfd */
+
+    pthread_t th[3]; /* two readers (both do poll) and one writer */
+    pthread_check(pthread_create(&th[0], NULL, write_eventfd_thread, NULL));
+    pthread_check(pthread_create(&th[1], NULL, poll_then_read_eventfd_thread, NULL));
+    pthread_check(pthread_create(&th[2], NULL, poll_then_read_eventfd_thread, NULL));
+
+    for (int i = 0; i < 3; i++) {
+        pthread_check(pthread_join(th[i], NULL));
+    }
+    CHECK(close(g_efd));
+    if (g_total_events != TEST_RUNS)
+        errx(1, "total events for subtest is %lu (expected %d)", g_total_events, TEST_RUNS);
+}
+
+static void eventfd_two_writers(void) {
+    g_write_events = g_total_events = 0;
+    g_stop_test = false;
+    /* a blocking semaphore eventfd (we don't want reads to reset value, so that writes block) */
+    g_efd = CHECK(eventfd(0, EFD_SEMAPHORE));
+    uint64_t val = UINT64_MAX - 1;
+    if (write(g_efd, &val, sizeof(val)) != sizeof(val))
+        errx(1, "initial eventfd write failed");
+
+    pthread_t th[3]; /* two writers (both do blocking writes) and one reader */
+    pthread_check(pthread_create(&th[0], NULL, blocking_write_eventfd_thread, NULL));
+    pthread_check(pthread_create(&th[1], NULL, blocking_write_eventfd_thread, NULL));
+    pthread_check(pthread_create(&th[2], NULL, read_for_blocking_write_eventfd_thread, NULL));
+
+    for (int i = 0; i < 3; i++) {
+        pthread_check(pthread_join(th[i], NULL));
+    }
+    CHECK(close(g_efd));
+    if (g_total_events != TEST_RUNS)
+        errx(1, "total events for subtest is %lu (expected %d)", g_total_events, TEST_RUNS);
+}
+
+int main(void) {
+    setbuf(stdout, NULL);
+    puts("------------------------");
+    eventfd_two_readers_doing_two_reads();
+    puts("------------------------");
+    eventfd_two_readers_doing_read_and_poll();
+    puts("------------------------");
+    eventfd_two_readers_doing_two_polls();
+    puts("------------------------");
+    eventfd_two_writers();
+    puts("TEST OK");
+    return 0;
+}
diff --git a/libos/test/regression/manifest.template b/libos/test/regression/manifest.template
index ae5f6cda89..692bfe1a5b 100644
--- a/libos/test/regression/manifest.template
+++ b/libos/test/regression/manifest.template
@@ -4,9 +4,6 @@ libos.entrypoint = "{{ entrypoint }}"
 loader.env.LD_LIBRARY_PATH = "/lib:{{ arch_libdir }}:/usr/{{ arch_libdir }}"
 loader.insecure__use_cmdline_argv = true
 
-# for eventfd test
-sys.insecure__allow_eventfd = true
-
 # for flock_lock test
 sys.experimental__enable_flock = true
 
diff --git a/libos/test/regression/meson.build b/libos/test/regression/meson.build
index c5434a683a..48b9f54a05 100644
--- a/libos/test/regression/meson.build
+++ b/libos/test/regression/meson.build
@@ -21,6 +21,7 @@ tests = {
     'epoll_test': {},
     'eventfd': {},
     'eventfd_fork': {},
+    'eventfd_races': {},
     'eventfd_read_then_write': {},
     'exec': {},
     'exec_fork': {},
diff --git a/libos/test/regression/test_libos.py b/libos/test/regression/test_libos.py
index 4678322a7c..73590b8581 100644
--- a/libos/test/regression/test_libos.py
+++ b/libos/test/regression/test_libos.py
@@ -899,6 +899,20 @@ def test_072_eventfd_read_then_write(self):
         stdout, _ = self.run_binary(['eventfd_read_then_write'])
         self.assertIn('TEST OK', stdout)
 
+    def test_073_eventfd_fork_allowed_failing(self):
+        try:
+            self.run_binary(['eventfd_fork_allowed_failing'])
+            self.fail('eventfd_fork_allowed_failing unexpectedly succeeded')
+        except subprocess.CalledProcessError as e:
+            stdout = e.stdout.decode()
+            stderr = e.stderr.decode()
+            self.assertIn('Child process tried to access eventfd created by parent process', stderr)
+            self.assertIn('eventfd write failed', stdout)
+
+    def test_074_eventfd_races(self):
+        stdout, _ = self.run_binary(['eventfd_races'])
+        self.assertIn('TEST OK', stdout)
+
     @unittest.skipIf(USES_MUSL, 'sched_setscheduler is not supported in musl')
     def test_080_sched(self):
         stdout, _ = self.run_binary(['sched'])
diff --git a/libos/test/regression/tests.toml b/libos/test/regression/tests.toml
index 3dd6707315..9b68f74ee0 100644
--- a/libos/test/regression/tests.toml
+++ b/libos/test/regression/tests.toml
@@ -22,6 +22,8 @@ manifests = [
   "epoll_test",
   "eventfd",
   "eventfd_fork",
+  "eventfd_fork_allowed_failing",
+  "eventfd_races",
   "eventfd_read_then_write",
   "exec",
   "exec_fork",
diff --git a/libos/test/regression/tests_musl.toml b/libos/test/regression/tests_musl.toml
index 07a9e8ca3a..b553efa678 100644
--- a/libos/test/regression/tests_musl.toml
+++ b/libos/test/regression/tests_musl.toml
@@ -24,6 +24,8 @@ manifests = [
   "epoll_test",
   "eventfd",
   "eventfd_fork",
+  "eventfd_fork_allowed_failing",
+  "eventfd_races",
   "eventfd_read_then_write",
   "exec",
   "exec_fork",