From 4d0040be1f3804cc0cfc6519a49ffe414958167b Mon Sep 17 00:00:00 2001
From: Jeremy Allison <jallison@ciq.com>
Date: Mon, 16 Jun 2025 13:34:27 -0700
Subject: [PATCH 01/11] crypto: jitter - replace LFSR with SHA3-256
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

        Using the kernel crypto API, the SHA3-256 algorithm is used as
        conditioning element to replace the LFSR in the Jitter RNG. All other
        parts of the Jitter RNG are unchanged.

        The application and use of the SHA-3 conditioning operation is identical
        to the user space Jitter RNG 3.4.0 by applying the following concept:

        - the Jitter RNG initializes a SHA-3 state which acts as the "entropy
          pool" when the Jitter RNG is allocated.

        - When a new time delta is obtained, it is inserted into the "entropy
          pool" with a SHA-3 update operation. Note, this operation in most of
          the cases is a simple memcpy() onto the SHA-3 stack.

        - To cause a true SHA-3 operation for each time delta operation, a
          second SHA-3 operation is performed hashing Jitter RNG status
          information. The final message digest is also inserted into the
          "entropy pool" with a SHA-3 update operation. Yet, this data is not
          considered to provide any entropy, but it shall stir the entropy pool.

        - To generate a random number, a SHA-3 final operation is performed to
          calculate a message digest followed by an immediate SHA-3 init to
          re-initialize the "entropy pool". The obtained message digest is one
          block of the Jitter RNG that is returned to the caller.

        Mathematically speaking, the random number generated by the Jitter RNG
        is:

        aux_i = SHA-3(Jitter RNG state data)

        Jitter RNG block = SHA-3(time_i || aux_i || time_(i-1) || aux_(i-1) ||
                                 ... || time_(i-255) || aux_(i-255))

        when assuming that the OSR = 1, i.e. the default value.

        This operation implies that the Jitter RNG has an output-blocksize of
        256 bits instead of the 64 bits of the LFSR-based Jitter RNG that is
        replaced with this patch.

        The patch also replaces the varying number of invocations of the
        conditioning function with one fixed number of invocations. The use
        of the conditioning function is consistent with the userspace Jitter
        RNG library version 3.4.0.

        The code is tested with a system that exhibited the least amount of
        entropy generated by the Jitter RNG: the SiFive Unmatched RISC-V
        system. The measured entropy rate is well above the heuristically
        implied entropy value of 1 bit of entropy per time delta. On all other
        tested systems, the measured entropy rate is even higher by orders
        of magnitude. The measurement was performed using updated tooling
        provided with the user space Jitter RNG library test framework.

        The performance of the Jitter RNG with this patch is about on par
        with the performance of the Jitter RNG without the patch.

        Signed-off-by: Stephan Mueller <smueller@chronox.de>
        Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

            Back-port of commit bb897c55042e9330bcf88b4b13cbdd6f9fabdd5e
            Author: Stephan Müller <smueller@chronox.de>
            Date:   Fri Apr 21 08:08:04 2023 +0200

Signed-off-by: Jeremy Allison <jallison@ciq.com>
Signed-off-by: Jonathan Maple <jmaple@ciq.com>
---
 crypto/Kconfig               |   1 +
 crypto/jitterentropy-kcapi.c | 183 +++++++++++++++++++++++++++++++----
 crypto/jitterentropy.c       | 143 +++++++++------------------
 crypto/jitterentropy.h       |  10 +-
 4 files changed, 218 insertions(+), 119 deletions(-)

diff --git a/crypto/Kconfig b/crypto/Kconfig
index c0054b9f23cbb..17c113cd5fe52 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -2019,6 +2019,7 @@ config CRYPTO_ANSI_CPRNG
 	tristate "Pseudo Random Number Generation for Cryptographic modules"
 	select CRYPTO_AES
 	select CRYPTO_RNG
+	select CRYPTO_SHA3
 	help
 	  This option enables the generic pseudo random number generator
 	  for cryptographic modules.  Uses the Algorithm specified in
diff --git a/crypto/jitterentropy-kcapi.c b/crypto/jitterentropy-kcapi.c
index b9edfaa51b273..4b50cbc8a2faf 100644
--- a/crypto/jitterentropy-kcapi.c
+++ b/crypto/jitterentropy-kcapi.c
@@ -2,7 +2,7 @@
  * Non-physical true random number generator based on timing jitter --
  * Linux Kernel Crypto API specific code
  *
- * Copyright Stephan Mueller <smueller@chronox.de>, 2015
+ * Copyright Stephan Mueller <smueller@chronox.de>, 2015 - 2023
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -37,6 +37,8 @@
  * DAMAGE.
  */
 
+#include <crypto/hash.h>
+#include <crypto/sha3.h>
 #include <linux/fips.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
@@ -46,6 +48,8 @@
 
 #include "jitterentropy.h"
 
+#define JENT_CONDITIONING_HASH	"sha3-256-generic"
+
 /***************************************************************************
  * Helper function
  ***************************************************************************/
@@ -60,11 +64,6 @@ void jent_zfree(void *ptr)
 	kfree_sensitive(ptr);
 }
 
-void jent_memcpy(void *dest, const void *src, unsigned int n)
-{
-	memcpy(dest, src, n);
-}
-
 /*
  * Obtain a high-resolution time stamp value. The time stamp is used to measure
  * the execution time of a given code path and its variations. Hence, the time
@@ -91,6 +90,91 @@ void jent_get_nstime(__u64 *out)
 	*out = tmp;
 }
 
+int jent_hash_time(void *hash_state, __u64 time, u8 *addtl,
+		   unsigned int addtl_len, __u64 hash_loop_cnt,
+		   unsigned int stuck)
+{
+	struct shash_desc *hash_state_desc = (struct shash_desc *)hash_state;
+	SHASH_DESC_ON_STACK(desc, hash_state_desc->tfm);
+	u8 intermediary[SHA3_256_DIGEST_SIZE];
+	__u64 j = 0;
+	int ret;
+
+	desc->tfm = hash_state_desc->tfm;
+
+	if (sizeof(intermediary) != crypto_shash_digestsize(desc->tfm)) {
+		pr_warn_ratelimited("Unexpected digest size\n");
+		return -EINVAL;
+	}
+
+	/*
+	 * This loop fills a buffer which is injected into the entropy pool.
+	 * The main reason for this loop is to execute something over which we
+	 * can perform a timing measurement. The injection of the resulting
+	 * data into the pool is performed to ensure the result is used and
+	 * the compiler cannot optimize the loop away in case the result is not
+	 * used at all. Yet that data is considered "additional information"
+	 * considering the terminology from SP800-90A without any entropy.
+	 *
+	 * Note, it does not matter which or how much data you inject, we are
+	 * interested in one Keccack1600 compression operation performed with
+	 * the crypto_shash_final.
+	 */
+	for (j = 0; j < hash_loop_cnt; j++) {
+		ret = crypto_shash_init(desc) ?:
+		      crypto_shash_update(desc, intermediary,
+					  sizeof(intermediary)) ?:
+		      crypto_shash_finup(desc, addtl, addtl_len, intermediary);
+		if (ret)
+			goto err;
+	}
+
+	/*
+	 * Inject the data from the previous loop into the pool. This data is
+	 * not considered to contain any entropy, but it stirs the pool a bit.
+	 */
+	ret = crypto_shash_update(desc, intermediary, sizeof(intermediary));
+	if (ret)
+		goto err;
+
+	/*
+	 * Insert the time stamp into the hash context representing the pool.
+	 *
+	 * If the time stamp is stuck, do not finally insert the value into the
+	 * entropy pool. Although this operation should not do any harm even
+	 * when the time stamp has no entropy, SP800-90B requires that any
+	 * conditioning operation to have an identical amount of input data
+	 * according to section 3.1.5.
+	 */
+	if (!stuck) {
+		ret = crypto_shash_update(hash_state_desc, (u8 *)&time,
+					  sizeof(__u64));
+	}
+
+err:
+	shash_desc_zero(desc);
+	memzero_explicit(intermediary, sizeof(intermediary));
+
+	return ret;
+}
+
+int jent_read_random_block(void *hash_state, char *dst, unsigned int dst_len)
+{
+	struct shash_desc *hash_state_desc = (struct shash_desc *)hash_state;
+	u8 jent_block[SHA3_256_DIGEST_SIZE];
+	/* Obtain data from entropy pool and re-initialize it */
+	int ret = crypto_shash_final(hash_state_desc, jent_block) ?:
+		  crypto_shash_init(hash_state_desc) ?:
+		  crypto_shash_update(hash_state_desc, jent_block,
+				      sizeof(jent_block));
+
+	if (!ret && dst_len)
+		memcpy(dst, jent_block, dst_len);
+
+	memzero_explicit(jent_block, sizeof(jent_block));
+	return ret;
+}
+
 /***************************************************************************
  * Kernel crypto API interface
  ***************************************************************************/
@@ -98,32 +182,82 @@ void jent_get_nstime(__u64 *out)
 struct jitterentropy {
 	spinlock_t jent_lock;
 	struct rand_data *entropy_collector;
+	struct crypto_shash *tfm;
+	struct shash_desc *sdesc;
 };
 
-static int jent_kcapi_init(struct crypto_tfm *tfm)
+static void jent_kcapi_cleanup(struct crypto_tfm *tfm)
 {
 	struct jitterentropy *rng = crypto_tfm_ctx(tfm);
-	int ret = 0;
 
-	rng->entropy_collector = jent_entropy_collector_alloc(1, 0);
-	if (!rng->entropy_collector)
-		ret = -ENOMEM;
+	spin_lock(&rng->jent_lock);
 
-	spin_lock_init(&rng->jent_lock);
-	return ret;
-}
+	if (rng->sdesc) {
+		shash_desc_zero(rng->sdesc);
+		kfree(rng->sdesc);
+	}
+	rng->sdesc = NULL;
 
-static void jent_kcapi_cleanup(struct crypto_tfm *tfm)
-{
-	struct jitterentropy *rng = crypto_tfm_ctx(tfm);
+	if (rng->tfm)
+		crypto_free_shash(rng->tfm);
+	rng->tfm = NULL;
 
-	spin_lock(&rng->jent_lock);
 	if (rng->entropy_collector)
 		jent_entropy_collector_free(rng->entropy_collector);
 	rng->entropy_collector = NULL;
 	spin_unlock(&rng->jent_lock);
 }
 
+static int jent_kcapi_init(struct crypto_tfm *tfm)
+{
+	struct jitterentropy *rng = crypto_tfm_ctx(tfm);
+	struct crypto_shash *hash;
+	struct shash_desc *sdesc;
+	int size, ret = 0;
+
+	spin_lock_init(&rng->jent_lock);
+
+	/*
+	 * Use SHA3-256 as conditioner. We allocate only the generic
+	 * implementation as we are not interested in high-performance. The
+	 * execution time of the SHA3 operation is measured and adds to the
+	 * Jitter RNG's unpredictable behavior. If we have a slower hash
+	 * implementation, the execution timing variations are larger. When
+	 * using a fast implementation, we would need to call it more often
+	 * as its variations are lower.
+	 */
+	hash = crypto_alloc_shash(JENT_CONDITIONING_HASH, 0, 0);
+	if (IS_ERR(hash)) {
+		pr_err("Cannot allocate conditioning digest\n");
+		return PTR_ERR(hash);
+	}
+	rng->tfm = hash;
+
+	size = sizeof(struct shash_desc) + crypto_shash_descsize(hash);
+	sdesc = kmalloc(size, GFP_KERNEL);
+	if (!sdesc) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	sdesc->tfm = hash;
+	crypto_shash_init(sdesc);
+	rng->sdesc = sdesc;
+
+	rng->entropy_collector = jent_entropy_collector_alloc(1, 0, sdesc);
+	if (!rng->entropy_collector) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	spin_lock_init(&rng->jent_lock);
+	return 0;
+
+err:
+	jent_kcapi_cleanup(tfm);
+	return ret;
+}
+
 static int jent_kcapi_random(struct crypto_rng *tfm,
 			     const u8 *src, unsigned int slen,
 			     u8 *rdata, unsigned int dlen)
@@ -180,15 +314,24 @@ static struct rng_alg jent_alg = {
 		.cra_module             = THIS_MODULE,
 		.cra_init               = jent_kcapi_init,
 		.cra_exit               = jent_kcapi_cleanup,
-
 	}
 };
 
 static int __init jent_mod_init(void)
 {
+	SHASH_DESC_ON_STACK(desc, tfm);
+	struct crypto_shash *tfm;
 	int ret = 0;
 
-	ret = jent_entropy_init();
+	tfm = crypto_alloc_shash(JENT_CONDITIONING_HASH, 0, 0);
+	if (IS_ERR(tfm))
+		return PTR_ERR(tfm);
+
+	desc->tfm = tfm;
+	crypto_shash_init(desc);
+	ret = jent_entropy_init(desc);
+	shash_desc_zero(desc);
+	crypto_free_shash(tfm);
 	if (ret) {
 		/* Handle permanent health test error */
 		if (fips_enabled)
diff --git a/crypto/jitterentropy.c b/crypto/jitterentropy.c
index 227cedfa4f0ae..5b224d3d7442e 100644
--- a/crypto/jitterentropy.c
+++ b/crypto/jitterentropy.c
@@ -2,7 +2,7 @@
  * Non-physical true random number generator based on timing jitter --
  * Jitter RNG standalone code.
  *
- * Copyright Stephan Mueller <smueller@chronox.de>, 2015 - 2020
+ * Copyright Stephan Mueller <smueller@chronox.de>, 2015 - 2023
  *
  * Design
  * ======
@@ -57,21 +57,22 @@
 typedef	unsigned long long	__u64;
 typedef	long long		__s64;
 typedef	unsigned int		__u32;
+typedef unsigned char		u8;
 #define NULL    ((void *) 0)
 
 /* The entropy pool */
 struct rand_data {
+	/* SHA3-256 is used as conditioner */
+#define DATA_SIZE_BITS 256
 	/* all data values that are vital to maintain the security
 	 * of the RNG are marked as SENSITIVE. A user must not
 	 * access that information while the RNG executes its loops to
 	 * calculate the next random value. */
-	__u64 data;		/* SENSITIVE Actual random number */
-	__u64 old_data;		/* SENSITIVE Previous random number */
-	__u64 prev_time;	/* SENSITIVE Previous time stamp */
-#define DATA_SIZE_BITS ((sizeof(__u64)) * 8)
-	__u64 last_delta;	/* SENSITIVE stuck test */
-	__s64 last_delta2;	/* SENSITIVE stuck test */
-	unsigned int osr;	/* Oversample rate */
+	void *hash_state;		/* SENSITIVE hash state entropy pool */
+	__u64 prev_time;		/* SENSITIVE Previous time stamp */
+	__u64 last_delta;		/* SENSITIVE stuck test */
+	__s64 last_delta2;		/* SENSITIVE stuck test */
+	unsigned int osr;		/* Oversample rate */
 #define JENT_MEMORY_BLOCKS 64
 #define JENT_MEMORY_BLOCKSIZE 32
 #define JENT_MEMORY_ACCESSLOOPS 128
@@ -301,15 +302,13 @@ static int jent_permanent_health_failure(struct rand_data *ec)
  * an entropy collection.
  *
  * Input:
- * @ec entropy collector struct -- may be NULL
  * @bits is the number of low bits of the timer to consider
  * @min is the number of bits we shift the timer value to the right at
  *	the end to make sure we have a guaranteed minimum value
  *
  * @return Newly calculated loop counter
  */
-static __u64 jent_loop_shuffle(struct rand_data *ec,
-			       unsigned int bits, unsigned int min)
+static __u64 jent_loop_shuffle(unsigned int bits, unsigned int min)
 {
 	__u64 time = 0;
 	__u64 shuffle = 0;
@@ -317,12 +316,7 @@ static __u64 jent_loop_shuffle(struct rand_data *ec,
 	unsigned int mask = (1<<bits) - 1;
 
 	jent_get_nstime(&time);
-	/*
-	 * Mix the current state of the random number into the shuffle
-	 * calculation to balance that shuffle a bit more.
-	 */
-	if (ec)
-		time ^= ec->data;
+
 	/*
 	 * We fold the time value as much as possible to ensure that as many
 	 * bits of the time stamp are included as possible.
@@ -344,81 +338,32 @@ static __u64 jent_loop_shuffle(struct rand_data *ec,
  *			      execution time jitter
  *
  * This function injects the individual bits of the time value into the
- * entropy pool using an LFSR.
+ * entropy pool using a hash.
  *
- * The code is deliberately inefficient with respect to the bit shifting
- * and shall stay that way. This function is the root cause why the code
- * shall be compiled without optimization. This function not only acts as
- * folding operation, but this function's execution is used to measure
- * the CPU execution time jitter. Any change to the loop in this function
- * implies that careful retesting must be done.
- *
- * @ec [in] entropy collector struct
- * @time [in] time stamp to be injected
- * @loop_cnt [in] if a value not equal to 0 is set, use the given value as
- *		  number of loops to perform the folding
- * @stuck [in] Is the time stamp identified as stuck?
+ * ec [in] entropy collector
+ * time [in] time stamp to be injected
+ * stuck [in] Is the time stamp identified as stuck?
  *
  * Output:
- * updated ec->data
- *
- * @return Number of loops the folding operation is performed
+ * updated hash context in the entropy collector or error code
  */
-static void jent_lfsr_time(struct rand_data *ec, __u64 time, __u64 loop_cnt,
-			   int stuck)
+static int jent_condition_data(struct rand_data *ec, __u64 time, int stuck)
 {
-	unsigned int i;
-	__u64 j = 0;
-	__u64 new = 0;
-#define MAX_FOLD_LOOP_BIT 4
-#define MIN_FOLD_LOOP_BIT 0
-	__u64 fold_loop_cnt =
-		jent_loop_shuffle(ec, MAX_FOLD_LOOP_BIT, MIN_FOLD_LOOP_BIT);
-
-	/*
-	 * testing purposes -- allow test app to set the counter, not
-	 * needed during runtime
-	 */
-	if (loop_cnt)
-		fold_loop_cnt = loop_cnt;
-	for (j = 0; j < fold_loop_cnt; j++) {
-		new = ec->data;
-		for (i = 1; (DATA_SIZE_BITS) >= i; i++) {
-			__u64 tmp = time << (DATA_SIZE_BITS - i);
-
-			tmp = tmp >> (DATA_SIZE_BITS - 1);
-
-			/*
-			* Fibonacci LSFR with polynomial of
-			*  x^64 + x^61 + x^56 + x^31 + x^28 + x^23 + 1 which is
-			*  primitive according to
-			*   http://poincare.matf.bg.ac.rs/~ezivkovm/publications/primpol1.pdf
-			* (the shift values are the polynomial values minus one
-			* due to counting bits from 0 to 63). As the current
-			* position is always the LSB, the polynomial only needs
-			* to shift data in from the left without wrap.
-			*/
-			tmp ^= ((new >> 63) & 1);
-			tmp ^= ((new >> 60) & 1);
-			tmp ^= ((new >> 55) & 1);
-			tmp ^= ((new >> 30) & 1);
-			tmp ^= ((new >> 27) & 1);
-			tmp ^= ((new >> 22) & 1);
-			new <<= 1;
-			new ^= tmp;
-		}
-	}
-
-	/*
-	 * If the time stamp is stuck, do not finally insert the value into
-	 * the entropy pool. Although this operation should not do any harm
-	 * even when the time stamp has no entropy, SP800-90B requires that
-	 * any conditioning operation (SP800-90B considers the LFSR to be a
-	 * conditioning operation) to have an identical amount of input
-	 * data according to section 3.1.5.
-	 */
-	if (!stuck)
-		ec->data = new;
+#define SHA3_HASH_LOOP (1<<3)
+	struct {
+		int rct_count;
+		unsigned int apt_observations;
+		unsigned int apt_count;
+		unsigned int apt_base;
+	} addtl = {
+		ec->rct_count,
+		ec->apt_observations,
+		ec->apt_count,
+		ec->apt_base
+	};
+
+	return jent_hash_time(ec->hash_state, time, (u8 *)&addtl, sizeof(addtl),
+			      SHA3_HASH_LOOP, stuck);
 }
 
 /*
@@ -452,7 +397,7 @@ static void jent_memaccess(struct rand_data *ec, __u64 loop_cnt)
 #define MAX_ACC_LOOP_BIT 7
 #define MIN_ACC_LOOP_BIT 0
 	__u64 acc_loop_cnt =
-		jent_loop_shuffle(ec, MAX_ACC_LOOP_BIT, MIN_ACC_LOOP_BIT);
+		jent_loop_shuffle(MAX_ACC_LOOP_BIT, MIN_ACC_LOOP_BIT);
 
 	if (NULL == ec || NULL == ec->mem)
 		return;
@@ -520,14 +465,15 @@ static int jent_measure_jitter(struct rand_data *ec)
 	stuck = jent_stuck(ec, current_delta);
 
 	/* Now call the next noise sources which also injects the data */
-	jent_lfsr_time(ec, current_delta, 0, stuck);
+	if (jent_condition_data(ec, current_delta, stuck))
+		stuck = 1;
 
 	return stuck;
 }
 
 /*
  * Generator of one 64 bit random number
- * Function fills rand_data->data
+ * Function fills rand_data->hash_state
  *
  * @ec [in] Reference to entropy collector
  */
@@ -574,7 +520,7 @@ static void jent_gen_entropy(struct rand_data *ec)
  * @return 0 when request is fulfilled or an error
  *
  * The following error codes can occur:
- *	-1	entropy_collector is NULL
+ *	-1	entropy_collector is NULL or the generation failed
  *	-2	Intermittent health failure
  *	-3	Permanent health failure
  */
@@ -604,7 +550,7 @@ int jent_read_entropy(struct rand_data *ec, unsigned char *data,
 			 * Perform startup health tests and return permanent
 			 * error if it fails.
 			 */
-			if (jent_entropy_init())
+			if (jent_entropy_init(ec->hash_state))
 				return -3;
 
 			return -2;
@@ -614,7 +560,8 @@ int jent_read_entropy(struct rand_data *ec, unsigned char *data,
 			tocopy = (DATA_SIZE_BITS / 8);
 		else
 			tocopy = len;
-		jent_memcpy(p, &ec->data, tocopy);
+		if (jent_read_random_block(ec->hash_state, p, tocopy))
+			return -1;
 
 		len -= tocopy;
 		p += tocopy;
@@ -628,7 +575,8 @@ int jent_read_entropy(struct rand_data *ec, unsigned char *data,
  ***************************************************************************/
 
 struct rand_data *jent_entropy_collector_alloc(unsigned int osr,
-					       unsigned int flags)
+					       unsigned int flags,
+					       void *hash_state)
 {
 	struct rand_data *entropy_collector;
 
@@ -655,6 +603,8 @@ struct rand_data *jent_entropy_collector_alloc(unsigned int osr,
 		osr = 1; /* minimum sampling rate is 1 */
 	entropy_collector->osr = osr;
 
+	entropy_collector->hash_state = hash_state;
+
 	/* fill the data pad with non-zero values */
 	jent_gen_entropy(entropy_collector);
 
@@ -668,7 +618,7 @@ void jent_entropy_collector_free(struct rand_data *entropy_collector)
 	jent_zfree(entropy_collector);
 }
 
-int jent_entropy_init(void)
+int jent_entropy_init(void *hash_state)
 {
 	int i;
 	__u64 delta_sum = 0;
@@ -681,6 +631,7 @@ int jent_entropy_init(void)
 
 	/* Required for RCT */
 	ec.osr = 1;
+	ec.hash_state = hash_state;
 
 	/* We could perform statistical tests here, but the problem is
 	 * that we only have a few loop counts to do testing. These
@@ -718,7 +669,7 @@ int jent_entropy_init(void)
 		/* Invoke core entropy collection logic */
 		jent_get_nstime(&time);
 		ec.prev_time = time;
-		jent_lfsr_time(&ec, time, 0, 0);
+		jent_condition_data(&ec, time, 0);
 		jent_get_nstime(&time2);
 
 		/* test whether timer works */
diff --git a/crypto/jitterentropy.h b/crypto/jitterentropy.h
index 5cc583f6bc6b8..b3890ff26a023 100644
--- a/crypto/jitterentropy.h
+++ b/crypto/jitterentropy.h
@@ -2,14 +2,18 @@
 
 extern void *jent_zalloc(unsigned int len);
 extern void jent_zfree(void *ptr);
-extern void jent_memcpy(void *dest, const void *src, unsigned int n);
 extern void jent_get_nstime(__u64 *out);
+extern int jent_hash_time(void *hash_state, __u64 time, u8 *addtl,
+			  unsigned int addtl_len, __u64 hash_loop_cnt,
+			  unsigned int stuck);
+int jent_read_random_block(void *hash_state, char *dst, unsigned int dst_len);
 
 struct rand_data;
-extern int jent_entropy_init(void);
+extern int jent_entropy_init(void *hash_state);
 extern int jent_read_entropy(struct rand_data *ec, unsigned char *data,
 			     unsigned int len);
 
 extern struct rand_data *jent_entropy_collector_alloc(unsigned int osr,
-						      unsigned int flags);
+						      unsigned int flags,
+						      void *hash_state);
 extern void jent_entropy_collector_free(struct rand_data *entropy_collector);

From 254e14125d77e0029cf68d2d02f912d419efa16e Mon Sep 17 00:00:00 2001
From: Jeremy Allison <jallison@ciq.com>
Date: Wed, 4 Sep 2024 10:24:07 -0700
Subject: [PATCH 02/11] crypto: aead,cipher - zeroize key buffer after use

    I.G 9.7.B for FIPS 140-3 specifies that variables temporarily holding
    cryptographic information should be zeroized once they are no longer
    needed. Accomplish this by using kfree_sensitive for buffers that
    previously held the private key.

    Signed-off-by: Hailey Mothershead <hailmo@amazon.com>
    Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

        Back-ported from commit 23e4099bdc3c8381992f9eb975c79196d6755210
        Author: Hailey Mothershead <hailmo@amazon.com>
        Date:   Mon Apr 15 22:19:15 2024 +0000

Signed-off-by: Jeremy Allison <jallison@ciq.com>
Signed-off-by: Jonathan Maple <jmaple@ciq.com>
---
 crypto/aead.c   | 3 +--
 crypto/cipher.c | 3 +--
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/crypto/aead.c b/crypto/aead.c
index 16991095270d2..c4ece86c45bc4 100644
--- a/crypto/aead.c
+++ b/crypto/aead.c
@@ -35,8 +35,7 @@ static int setkey_unaligned(struct crypto_aead *tfm, const u8 *key,
 	alignbuffer = (u8 *)ALIGN((unsigned long)buffer, alignmask + 1);
 	memcpy(alignbuffer, key, keylen);
 	ret = crypto_aead_alg(tfm)->setkey(tfm, alignbuffer, keylen);
-	memset(alignbuffer, 0, keylen);
-	kfree(buffer);
+	kfree_sensitive(buffer);
 	return ret;
 }
 
diff --git a/crypto/cipher.c b/crypto/cipher.c
index b47141ed4a9f3..395f0c2fbb9ff 100644
--- a/crypto/cipher.c
+++ b/crypto/cipher.c
@@ -34,8 +34,7 @@ static int setkey_unaligned(struct crypto_cipher *tfm, const u8 *key,
 	alignbuffer = (u8 *)ALIGN((unsigned long)buffer, alignmask + 1);
 	memcpy(alignbuffer, key, keylen);
 	ret = cia->cia_setkey(crypto_cipher_tfm(tfm), alignbuffer, keylen);
-	memset(alignbuffer, 0, keylen);
-	kfree(buffer);
+	kfree_sensitive(buffer);
 	return ret;
 
 }

From 7193bf0b15ad735a5ec113f4dc35174bb4917523 Mon Sep 17 00:00:00 2001
From: Jeremy Allison <jallison@ciq.com>
Date: Thu, 29 Aug 2024 16:58:53 -0700
Subject: [PATCH 03/11] SUSE: patch: crypto-ecdh-implement-FIPS-PCT.patch

Signed-off-by: Jeremy Allison <jallison@ciq.com>
Signed-off-by: Jonathan Maple <jmaple@ciq.com>
---
 crypto/ecdh.c | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

diff --git a/crypto/ecdh.c b/crypto/ecdh.c
index fe8966511e9d7..af702cfefd22f 100644
--- a/crypto/ecdh.c
+++ b/crypto/ecdh.c
@@ -10,6 +10,7 @@
 #include <crypto/kpp.h>
 #include <crypto/ecdh.h>
 #include <linux/scatterlist.h>
+#include <linux/fips.h>
 #include "ecc.h"
 
 struct ecdh_ctx {
@@ -94,6 +95,36 @@ static int ecdh_compute_value(struct kpp_request *req)
 				       ctx->private_key, public_key);
 		buf = public_key;
 		nbytes = public_key_sz;
+
+		/*
+		 * SP800-56Arev3, 5.6.2.1.4: ("Owner Assurance of
+		 * Pair-wise Consistency"): recompute the public key
+		 * and check if the results match.
+		 */
+		if (fips_enabled) {
+			u64 *public_key_pct;
+
+			if (ret < 0)
+				goto free_all;
+
+			public_key_pct = kmalloc(public_key_sz, GFP_KERNEL);
+			if (!public_key_pct) {
+				ret = -ENOMEM;
+				goto free_all;
+			}
+
+			ret = ecc_make_pub_key(ctx->curve_id, ctx->ndigits,
+					       ctx->private_key,
+					       public_key_pct);
+			if (ret < 0) {
+				kfree(public_key_pct);
+				goto free_all;
+			}
+
+			if (memcmp(public_key, public_key_pct, public_key_sz))
+				panic("ECDH PCT failed in FIPS mode");
+			kfree(public_key_pct);
+		}
 	}
 
 	if (ret < 0)

From abbe7593a85a5d485162981cd617edd1a12cda0a Mon Sep 17 00:00:00 2001
From: Joachim Vandersmissen <git@jvdsn.com>
Date: Thu, 28 Mar 2024 11:24:30 -0500
Subject: [PATCH 04/11] crypto: ecdh - explicitly zeroize private_key

private_key is overwritten with the key parameter passed in by the
caller (if present), or alternatively a newly generated private key.
However, it is possible that the caller provides a key (or the newly
generated key) which is shorter than the previous key. In that
scenario, some key material from the previous key would not be
overwritten. The easiest solution is to explicitly zeroize the entire
private_key array first.

Note that this patch slightly changes the behavior of this function:
previously, if the ecc_gen_privkey failed, the old private_key would
remain. Now, the private_key is always zeroized. This behavior is
consistent with the case where params.key is set and ecc_is_key_valid
fails.

Signed-off-by: Joachim Vandersmissen <git@jvdsn.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Jonathan Maple <jmaple@ciq.com>
---
 crypto/ecdh.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/crypto/ecdh.c b/crypto/ecdh.c
index af702cfefd22f..85c64f1a40df2 100644
--- a/crypto/ecdh.c
+++ b/crypto/ecdh.c
@@ -34,6 +34,8 @@ static int ecdh_set_secret(struct crypto_kpp *tfm, const void *buf,
 	    params.key_size > sizeof(u64) * ctx->ndigits)
 		return -EINVAL;
 
+	memset(ctx->private_key, 0, sizeof(ctx->private_key));
+
 	if (!params.key || !params.key_size)
 		return ecc_gen_privkey(ctx->curve_id, ctx->ndigits,
 				       ctx->private_key);

From ca84d884818a8d58b0dda08b92c1939f3c10fce0 Mon Sep 17 00:00:00 2001
From: Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
Date: Thu, 14 Dec 2023 11:08:34 +0800
Subject: [PATCH 05/11] crypto: lib/mpi - Fix unexpected pointer access in
 mpi_ec_init

[ Upstream commit ba3c5574203034781ac4231acf117da917efcd2a ]

When the mpi_ec_ctx structure is initialized, some fields are not
cleared, causing a crash when referencing the field when the
structure was released. Initially, this issue was ignored because
memory for mpi_ec_ctx is allocated with the __GFP_ZERO flag.
For example, this error will be triggered when calculating the
Za value for SM2 separately.

Fixes: d58bb7e55a8a ("lib/mpi: Introduce ec implementation to MPI library")
Cc: stable@vger.kernel.org # v6.5
Signed-off-by: Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Jonathan Maple <jmaple@ciq.com>
---
 lib/mpi/ec.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/lib/mpi/ec.c b/lib/mpi/ec.c
index 40f5908e57a4f..e16dca1e23d52 100644
--- a/lib/mpi/ec.c
+++ b/lib/mpi/ec.c
@@ -584,6 +584,9 @@ void mpi_ec_init(struct mpi_ec_ctx *ctx, enum gcry_mpi_ec_models model,
 	ctx->a = mpi_copy(a);
 	ctx->b = mpi_copy(b);
 
+	ctx->d = NULL;
+	ctx->t.two_inv_p = NULL;
+
 	ctx->t.p_barrett = use_barrett > 0 ? mpi_barrett_init(ctx->p, 0) : NULL;
 
 	mpi_ec_get_reset(ctx);

From 47ede346d221e657feb8e0c01a7891d3b4a3583b Mon Sep 17 00:00:00 2001
From: Jason Rodriguez <jrodriguez@ciq.com>
Date: Mon, 30 Sep 2024 12:57:14 -0400
Subject: [PATCH 06/11] In essiv_aead_setkey(), use the same logic as
 crypto_authenc_esn_setkey() to zeroize keys on exit. converting ws

Signed-off-by: Jonathan Maple <jmaple@ciq.com>
---
 crypto/essiv.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/crypto/essiv.c b/crypto/essiv.c
index 8bcc5bdcb2a95..ec81bdea25631 100644
--- a/crypto/essiv.c
+++ b/crypto/essiv.c
@@ -114,13 +114,16 @@ static int essiv_aead_setkey(struct crypto_aead *tfm, const u8 *key,
 	      crypto_shash_update(desc, keys.enckey, keys.enckeylen) ?:
 	      crypto_shash_finup(desc, keys.authkey, keys.authkeylen, salt);
 	if (err)
-		return err;
+		goto out;
 
 	crypto_cipher_clear_flags(tctx->essiv_cipher, CRYPTO_TFM_REQ_MASK);
 	crypto_cipher_set_flags(tctx->essiv_cipher, crypto_aead_get_flags(tfm) &
 						    CRYPTO_TFM_REQ_MASK);
-	return crypto_cipher_setkey(tctx->essiv_cipher, salt,
-				    crypto_shash_digestsize(tctx->hash));
+	err = crypto_cipher_setkey(tctx->essiv_cipher, salt,
+				   crypto_shash_digestsize(tctx->hash));
+out:
+	memzero_explicit(&keys, sizeof(keys));
+	return err;
 }
 
 static int essiv_aead_setauthsize(struct crypto_aead *tfm,

From 40e07be5e01ab954a1f03860517ff3f6c696d2a8 Mon Sep 17 00:00:00 2001
From: Sultan Alsawaf <sultan@ciq.com>
Date: Wed, 11 Jun 2025 14:16:35 -0700
Subject: [PATCH 07/11] crypto: drbg - Align buffers to at least a cache line

None of the ciphers used by the DRBG have an alignment requirement; thus,
they all return 0 from .crypto_init, resulting in inconsistent alignment
across all buffers.

Align all buffers to at least a cache line to improve performance. This is
especially useful when multiple DRBG instances are used, since it prevents
false sharing of cache lines between the different instances.

Signed-off-by: Sultan Alsawaf <sultan@ciq.com>

Signed-off-by: Jonathan Maple <jmaple@ciq.com>
---
 crypto/drbg.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/crypto/drbg.c b/crypto/drbg.c
index accf425de57f7..d14cc09b5d399 100644
--- a/crypto/drbg.c
+++ b/crypto/drbg.c
@@ -1283,6 +1283,12 @@ static inline int drbg_alloc_state(struct drbg_state *drbg)
 	if (ret < 0)
 		goto err;
 
+	/*
+	 * Align to at least a cache line for better performance. This also
+	 * prevents false sharing of cache lines between different instances.
+	 */
+	ret = max(ret, L1_CACHE_BYTES - 1);
+
 	drbg->Vbuf = kmalloc(drbg_statelen(drbg) + ret, GFP_KERNEL);
 	if (!drbg->Vbuf) {
 		ret = -ENOMEM;

From 2dbc32693082f737ac747ce5bfbeb321a98d8e83 Mon Sep 17 00:00:00 2001
From: Sultan Alsawaf <sultan@ciq.com>
Date: Wed, 18 Jun 2025 23:42:08 -0700
Subject: [PATCH 08/11] mm/gup: introduce pin_user_pages_fast_only()

Like pin_user_pages_fast(), but with the internal-only FOLL_FAST_ONLY flag.

This complements the get_user_pages*() API, which already has
get_user_pages_fast_only().

Signed-off-by: Sultan Alsawaf <sultan@ciq.com>

Signed-off-by: Jonathan Maple <jmaple@ciq.com>
---
 include/linux/mm.h |  2 ++
 mm/gup.c           | 28 ++++++++++++++++++++++++++++
 2 files changed, 30 insertions(+)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 196c481ec1603..b640f479624c1 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2544,6 +2544,8 @@ int get_user_pages_fast(unsigned long start, int nr_pages,
 			unsigned int gup_flags, struct page **pages);
 int pin_user_pages_fast(unsigned long start, int nr_pages,
 			unsigned int gup_flags, struct page **pages);
+int pin_user_pages_fast_only(unsigned long start, int nr_pages,
+			     unsigned int gup_flags, struct page **pages);
 void folio_add_pin(struct folio *folio);
 
 int account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc);
diff --git a/mm/gup.c b/mm/gup.c
index ad7345cfba91d..656f1ead20bb4 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -3353,6 +3353,34 @@ int pin_user_pages_fast(unsigned long start, int nr_pages,
 }
 EXPORT_SYMBOL_GPL(pin_user_pages_fast);
 
+/**
+ * pin_user_pages_fast_only() - pin user pages using only the fast GUP path
+ * @start:      starting user address
+ * @nr_pages:   number of pages from start to pin
+ * @gup_flags:  flags modifying pin behaviour
+ * @pages:      array of at least @nr_pages entries receiving the pinned pages
+ *
+ * Like pin_user_pages_fast() except it's IRQ-safe in that it won't fall back to
+ * the regular GUP. If the architecture does not support this function, simply
+ * return with no pages pinned.
+ *
+ * Careful, careful! COW breaking can go either way, so a non-write access can
+ * get ambiguous page results. If you call this function without FOLL_WRITE in
+ * @gup_flags, you'd better be sure that you're ok with that ambiguity.
+ *
+ * Return: -EINVAL if is_valid_gup_args() rejects the arguments, otherwise the
+ * result of internal_get_user_pages_fast().
+ */
+int pin_user_pages_fast_only(unsigned long start, int nr_pages,
+			     unsigned int gup_flags, struct page **pages)
+{
+	if (!is_valid_gup_args(pages, NULL, &gup_flags,
+			       FOLL_PIN | FOLL_FAST_ONLY))
+		return -EINVAL;
+	return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages);
+}
+EXPORT_SYMBOL_GPL(pin_user_pages_fast_only);
+
 /**
  * pin_user_pages_remote() - pin pages of a remote process
  *

From f2e53737348d4487fcde5a932114586b6b580b8b Mon Sep 17 00:00:00 2001
From: Sultan Alsawaf <sultan@ciq.com>
Date: Tue, 24 Jun 2025 15:16:34 -0700
Subject: [PATCH 09/11] crypto: rng - Convert crypto_default_rng_refcnt into an
 unsigned int

There is no reason this refcount should be a signed int. Convert it to an
unsigned int, thereby also making it less likely to ever overflow.

Signed-off-by: Sultan Alsawaf <sultan@ciq.com>

Signed-off-by: Jonathan Maple <jmaple@ciq.com>
---
 crypto/rng.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/crypto/rng.c b/crypto/rng.c
index c650678106a7f..45be628f1d194 100644
--- a/crypto/rng.c
+++ b/crypto/rng.c
@@ -31,7 +31,7 @@ static struct crypto_rng *crypto_reseed_rng;
 static ____cacheline_aligned_in_smp DEFINE_MUTEX(crypto_default_rng_lock);
 struct crypto_rng *crypto_default_rng;
 EXPORT_SYMBOL_GPL(crypto_default_rng);
-static int crypto_default_rng_refcnt;
+static unsigned int crypto_default_rng_refcnt;
 
 int crypto_rng_reset(struct crypto_rng *tfm, const u8 *seed, unsigned int slen)
 {
@@ -164,7 +164,7 @@ void crypto_put_default_rng(void)
 EXPORT_SYMBOL_GPL(crypto_put_default_rng);
 
 #if defined(CONFIG_CRYPTO_RNG) || defined(CONFIG_CRYPTO_RNG_MODULE)
-static int crypto_del_rng(struct crypto_rng **rngp, int *refcntp,
+static int crypto_del_rng(struct crypto_rng **rngp, unsigned int *refcntp,
 		      struct mutex *lock)
 {
 	int err = -EBUSY;

From 28160b611374a15f24e6d8c101761c9d728a864e Mon Sep 17 00:00:00 2001
From: Sultan Alsawaf <sultan@ciq.com>
Date: Tue, 24 Jun 2025 15:31:00 -0700
Subject: [PATCH 10/11] crypto: rng - Fix priority inversions due to mutex
 locks

Since crypto_devrandom_read_iter() is invoked directly by user tasks and is
accessible by every task in the system, there are glaring priority
inversions on crypto_reseed_rng_lock and crypto_default_rng_lock.

Tasks of arbitrary scheduling priority access crypto_devrandom_read_iter().
When a low-priority task owns one of the mutex locks, higher-priority tasks
waiting on that mutex lock are stalled until the low-priority task is done.

Fix the priority inversions by converting the mutex locks into rt_mutex
locks which have PI support.

Signed-off-by: Sultan Alsawaf <sultan@ciq.com>

Signed-off-by: Jonathan Maple <jmaple@ciq.com>
---
 crypto/rng.c | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/crypto/rng.c b/crypto/rng.c
index 45be628f1d194..31691806b0037 100644
--- a/crypto/rng.c
+++ b/crypto/rng.c
@@ -14,7 +14,7 @@
 #include <linux/fips.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
-#include <linux/mutex.h>
+#include <linux/rtmutex.h>
 #include <linux/random.h>
 #include <linux/seq_file.h>
 #include <linux/sched.h>
@@ -26,9 +26,9 @@
 
 #include "internal.h"
 
-static ____cacheline_aligned_in_smp DEFINE_MUTEX(crypto_reseed_rng_lock);
+static ____cacheline_aligned_in_smp DEFINE_RT_MUTEX(crypto_reseed_rng_lock);
 static struct crypto_rng *crypto_reseed_rng;
-static ____cacheline_aligned_in_smp DEFINE_MUTEX(crypto_default_rng_lock);
+static ____cacheline_aligned_in_smp DEFINE_RT_MUTEX(crypto_default_rng_lock);
 struct crypto_rng *crypto_default_rng;
 EXPORT_SYMBOL_GPL(crypto_default_rng);
 static unsigned int crypto_default_rng_refcnt;
@@ -145,11 +145,11 @@ int crypto_get_default_rng(void)
 {
 	int err;
 
-	mutex_lock(&crypto_default_rng_lock);
+	rt_mutex_lock(&crypto_default_rng_lock);
 	err = crypto_get_rng(&crypto_default_rng);
 	if (!err)
 		crypto_default_rng_refcnt++;
-	mutex_unlock(&crypto_default_rng_lock);
+	rt_mutex_unlock(&crypto_default_rng_lock);
 
 	return err;
 }
@@ -157,19 +157,19 @@ EXPORT_SYMBOL_GPL(crypto_get_default_rng);
 
 void crypto_put_default_rng(void)
 {
-	mutex_lock(&crypto_default_rng_lock);
+	rt_mutex_lock(&crypto_default_rng_lock);
 	crypto_default_rng_refcnt--;
-	mutex_unlock(&crypto_default_rng_lock);
+	rt_mutex_unlock(&crypto_default_rng_lock);
 }
 EXPORT_SYMBOL_GPL(crypto_put_default_rng);
 
 #if defined(CONFIG_CRYPTO_RNG) || defined(CONFIG_CRYPTO_RNG_MODULE)
 static int crypto_del_rng(struct crypto_rng **rngp, unsigned int *refcntp,
-		      struct mutex *lock)
+			  struct rt_mutex *lock)
 {
 	int err = -EBUSY;
 
-	mutex_lock(lock);
+	rt_mutex_lock(lock);
 	if (refcntp && *refcntp)
 		goto out;
 
@@ -179,7 +179,7 @@ static int crypto_del_rng(struct crypto_rng **rngp, unsigned int *refcntp,
 	err = 0;
 
 out:
-	mutex_unlock(lock);
+	rt_mutex_unlock(lock);
 
 	return err;
 }
@@ -264,7 +264,7 @@ static ssize_t crypto_devrandom_read_iter(struct iov_iter *iter, bool reseed)
 		 * a separate mutex (drbg->drbg_mutex) around the
 		 * reseed-and-generate operation.
 		 */
-		mutex_lock(&crypto_reseed_rng_lock);
+		rt_mutex_lock(&crypto_reseed_rng_lock);
 
 		/* If crypto_default_rng is not set, it will be seeded
 		 * at creation in __crypto_get_default_rng and thus no
@@ -275,7 +275,7 @@ static ssize_t crypto_devrandom_read_iter(struct iov_iter *iter, bool reseed)
 
 		ret = crypto_get_rng(&crypto_reseed_rng);
 		if (ret) {
-			mutex_unlock(&crypto_reseed_rng_lock);
+			rt_mutex_unlock(&crypto_reseed_rng_lock);
 			return ret;
 		}
 
@@ -314,7 +314,7 @@ static ssize_t crypto_devrandom_read_iter(struct iov_iter *iter, bool reseed)
 	}
 
 	if (reseed)
-		mutex_unlock(&crypto_reseed_rng_lock);
+		rt_mutex_unlock(&crypto_reseed_rng_lock);
 	else
 		crypto_put_default_rng();
 	memzero_explicit(tmp, sizeof(tmp));

From 51923823e82894df6a74ddda5bcef41f40f18f2a Mon Sep 17 00:00:00 2001
From: Sultan Alsawaf <sultan@ciq.com>
Date: Fri, 27 Jun 2025 19:06:18 -0700
Subject: [PATCH 11/11] crypto: rng - Implement fast per-CPU DRBG instances

When the kernel is booted with fips=1, the RNG exposed to userspace is
hijacked away from the CRNG and redirected to crypto_devrandom_read_iter(),
which utilizes the DRBG.

Notably, crypto_devrandom_read_iter() maintains just two global DRBG
instances _for the entire system_, and the two instances serve separate
request types: one instance for GRND_RANDOM requests (crypto_reseed_rng),
and one instance for non-GRND_RANDOM requests (crypto_default_rng). So in
essence, for requests of a single type, there is just one global RNG for
all CPUs in the entire system, which scales _very_ poorly.

To make matters worse, the temporary buffer used to ferry data between the
DRBG and userspace is woefully small at only 256 bytes, which doesn't do a
good job of maximizing throughput from the DRBG. This results in lost
performance when userspace requests >256 bytes; it is observed that DRBG
throughput improves by 70% on an i9-13900H when the buffer size is
increased to 4096 bytes (one page). Going beyond the size of one page up to
the DRBG maximum request limit of 65536 bytes produces diminishing returns
of only 3% improved throughput in comparison. And going below the size of
one page produces progressively less throughput at each power of 2: there's
a 5% loss going from 4096 bytes to 2048 bytes and a 9% loss going from 2048
bytes to 1024 bytes.

Thus, this implements per-CPU DRBG instances utilizing a page-sized buffer
for each CPU to utilize the DRBG itself more effectively. On top of that,
for non-GRND_RANDOM requests, the DRBG's operations now occur under a local
lock that disables preemption on non-PREEMPT_RT kernels, which not only
keeps each CPU's DRBG instance isolated from another, but also improves
temporal cache locality while the DRBG actively generates a new string of
random bytes.

Prefaulting one user destination page at a time is also employed to prevent
a DRBG instance from getting blocked on page faults, thereby maximizing the
use of the DRBG so that the only bottleneck is the DRBG itself.

Signed-off-by: Sultan Alsawaf <sultan@ciq.com>
Signed-off-by: Jonathan Maple <jmaple@ciq.com>
---
 crypto/rng.c | 514 +++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 440 insertions(+), 74 deletions(-)

diff --git a/crypto/rng.c b/crypto/rng.c
index 31691806b0037..b5ab31ef7db9a 100644
--- a/crypto/rng.c
+++ b/crypto/rng.c
@@ -6,6 +6,9 @@
  *
  * Copyright (c) 2008 Neil Horman <nhorman@tuxdriver.com>
  * Copyright (c) 2015 Herbert Xu <herbert@gondor.apana.org.au>
+ *
+ * Copyright (C) 2025 Ctrl IQ, Inc.
+ * Author: Sultan Alsawaf <sultan@ciq.com>
  */
 
 #include <linux/atomic.h>
@@ -26,13 +29,39 @@
 
 #include "internal.h"
 
-static ____cacheline_aligned_in_smp DEFINE_RT_MUTEX(crypto_reseed_rng_lock);
-static struct crypto_rng *crypto_reseed_rng;
 static ____cacheline_aligned_in_smp DEFINE_RT_MUTEX(crypto_default_rng_lock);
 struct crypto_rng *crypto_default_rng;
 EXPORT_SYMBOL_GPL(crypto_default_rng);
 static unsigned int crypto_default_rng_refcnt;
 
+/*
+ * Per-CPU RNG instances are only used by crypto_devrandom_rng. The global RNG,
+ * crypto_default_rng, is only used directly by other drivers.
+ *
+ * Per-CPU instances of the DRBG are efficient because the DRBG itself supports
+ * an arbitrary number of instances and can be seeded on a per-CPU basis.
+ *
+ * Specifically, the DRBG is seeded by the CRNG and the Jitter RNG. The CRNG is
+ * globally accessible and is already per-CPU. And while the Jitter RNG _isn't_
+ * per-CPU, creating a DRBG instance also creates a Jitter RNG instance;
+ * therefore, per-CPU DRBG instances imply per-CPU Jitter RNG instances.
+ */
+struct cpu_rng_inst {
+	local_lock_t lock;	/* taken by lock_default_rng() only */
+	struct rt_mutex mlock;	/* serializes rng allocation and reseed reads */
+	struct crypto_rng *rng;	/* NULL while no rng is allocated */
+	void *page;		/* buffer filled by crypto_rng_get_bytes() */
+};
+
+static DEFINE_PER_CPU_ALIGNED(struct cpu_rng_inst, pcpu_default_rng) = {
+	.lock = INIT_LOCAL_LOCK(pcpu_default_rng.lock),
+	.mlock = __RT_MUTEX_INITIALIZER(pcpu_default_rng.mlock)
+};
+static DEFINE_PER_CPU_ALIGNED(struct cpu_rng_inst, pcpu_reseed_rng) = {
+	/* The reseed instances don't use the local lock */
+	.mlock = __RT_MUTEX_INITIALIZER(pcpu_reseed_rng.mlock)
+};
+
 int crypto_rng_reset(struct crypto_rng *tfm, const u8 *seed, unsigned int slen)
 {
 	struct crypto_alg *alg = tfm->base.__crt_alg;
@@ -164,34 +193,54 @@ void crypto_put_default_rng(void)
 EXPORT_SYMBOL_GPL(crypto_put_default_rng);
 
 #if defined(CONFIG_CRYPTO_RNG) || defined(CONFIG_CRYPTO_RNG_MODULE)
-static int crypto_del_rng(struct crypto_rng **rngp, unsigned int *refcntp,
-			  struct rt_mutex *lock)
-{
-	int err = -EBUSY;
-
-	rt_mutex_lock(lock);
-	if (refcntp && *refcntp)
-		goto out;
-
-	crypto_free_rng(*rngp);
-	*rngp = NULL;
+#define down_read_del_pcpu_rwsem() down_read(&del_pcpu_rwsem)
+#define up_read_del_pcpu_rwsem() up_read(&del_pcpu_rwsem)
+static DECLARE_RWSEM(del_pcpu_rwsem);
 
-	err = 0;
+static void crypto_del_pcpu_rng(struct cpu_rng_inst __percpu *pcri)
+{
+	int cpu;
 
-out:
-	rt_mutex_unlock(lock);
+	for_each_possible_cpu(cpu) {
+		struct cpu_rng_inst *cri = per_cpu_ptr(pcri, cpu);
 
-	return err;
+		if (cri->rng) {
+			crypto_free_rng(cri->rng);
+			cri->rng = NULL;
+		}
+	}
 }
 
 int crypto_del_default_rng(void)
 {
-	return crypto_del_rng(&crypto_default_rng, &crypto_default_rng_refcnt,
-			      &crypto_default_rng_lock) ?:
-	       crypto_del_rng(&crypto_reseed_rng, NULL,
-			      &crypto_reseed_rng_lock);
+	bool busy;
+
+	rt_mutex_lock(&crypto_default_rng_lock);
+	if (!(busy = crypto_default_rng_refcnt)) {
+		crypto_free_rng(crypto_default_rng);
+		crypto_default_rng = NULL;
+	}
+	rt_mutex_unlock(&crypto_default_rng_lock);
+	if (busy)
+		return -EBUSY;
+
+	if (!down_write_trylock(&del_pcpu_rwsem))
+		return -EBUSY;
+
+	crypto_del_pcpu_rng(&pcpu_default_rng);
+	crypto_del_pcpu_rng(&pcpu_reseed_rng);
+	up_write(&del_pcpu_rwsem);
+
+	return 0;
 }
 EXPORT_SYMBOL_GPL(crypto_del_default_rng);
+#else
+static inline void down_read_del_pcpu_rwsem(void)
+{
+}
+static inline void up_read_del_pcpu_rwsem(void)
+{
+}
 #endif
 
 int crypto_register_rng(struct rng_alg *alg)
@@ -244,80 +293,343 @@ void crypto_unregister_rngs(struct rng_alg *algs, int count)
 }
 EXPORT_SYMBOL_GPL(crypto_unregister_rngs);
 
+/*
+ * On non-PREEMPT_RT kernels, local locks disable preemption. When there's no
+ * rng allocated, one must be allocated by calling crypto_get_rng(), which can
+ * sleep. Therefore, crypto_get_rng() cannot be called under local_lock(), so if
+ * our CPU's RNG instance doesn't have an rng allocated, we drop the local lock
+ * and take a mutex lock instead. After the local lock is dropped, the current
+ * task can be freely migrated to another CPU, which means that calling
+ * local_lock() again might not result in the same instance getting locked as
+ * before. That's why this function exists: to loop on calling local_lock() and
+ * allocating an rng as needed with crypto_get_rng() until the current CPU's
+ * instance is found to have an rng allocated. If crypto_get_rng() ever fails,
+ * this function returns an error even if there are instances for other CPUs
+ * which _do_ have an rng allocated.
+ */
+static __always_inline struct cpu_rng_inst *
+lock_default_rng(struct crypto_rng **rng) __acquires(&cri->lock)
+{
+	struct cpu_rng_inst __percpu *pcri = &pcpu_default_rng;
+	struct cpu_rng_inst *cri;
+	int ret;
+
+	while (1) {
+		local_lock(&pcri->lock);
+		cri = this_cpu_ptr(pcri);
+		/*
+		 * cri->rng may have transitioned from non-NULL to NULL, but
+		 * underneath down_read_del_pcpu_rwsem() it can only transition
+		 * from NULL to non-NULL. This may occur on a different CPU,
+		 * thus cri->rng must be read atomically to prevent data races;
+		 * this elides mlock by pairing with the WRITE_ONCE() in the
+		 * slow path below.
+		 *
+		 * And if cri->rng is non-NULL, then it is good to go. To avoid
+		 * data races due to load speculation on torn cri->rng loads
+		 * _after_ the NULL check, one of the following is required:
+		 *   1. smp_acquire__after_ctrl_dep() in the if-statement
+		 *   2. All cri->rng reads are performed with READ_ONCE()
+		 *   3. cri->rng is never read again outside this function
+		 *
+		 * Option #3 yields the best performance, so this function
+		 * provides the rng pointer as an output for the caller to use.
+		 */
+		*rng = READ_ONCE(cri->rng);
+		if (likely(*rng))
+			return cri;
+
+		/*
+		 * Slow path: there's no rng currently allocated to this instance.
+		 * Release the local lock and acquire this instance's mlock to
+		 * perform the allocation. Like local_lock(), local_unlock() must
+		 * be given the __percpu lock since it applies this_cpu_ptr().
+		 * Note that this task may be migrated to a different CPU now!
+		 */
+		local_unlock(&pcri->lock);
+		rt_mutex_lock(&cri->mlock);
+		if (!cri->rng) {
+			struct crypto_rng *new_rng = NULL;
+
+			ret = crypto_get_rng(&new_rng);
+			if (ret) {
+				rt_mutex_unlock(&cri->mlock);
+				break;
+			}
+
+			/*
+			 * Pairs with READ_ONCE() above, because we might not be
+			 * on the same CPU anymore as when we first got `cri`.
+			 */
+			WRITE_ONCE(cri->rng, new_rng);
+		}
+		rt_mutex_unlock(&cri->mlock);
+	}
+
+	/*
+	 * Even if this task got migrated to another CPU that _does_ have an rng
+	 * allocated, just bail out if crypto_get_rng() ever fails in order to
+	 * avoid looping forever.
+	 */
+	return ERR_PTR(ret);
+}
+
+static __always_inline struct cpu_rng_inst *
+lock_reseed_rng(struct crypto_rng **rng) __acquires(&cri->mlock)
+{
+	struct cpu_rng_inst __percpu *pcri = &pcpu_reseed_rng;
+	struct cpu_rng_inst *cri;
+	int ret;
+
+	/*
+	 * Use whichever CPU this task is currently running on, knowing full
+	 * well that the task can freely migrate to other CPUs. The reseed RNG
+	 * requires holding a lock across the entire devrandom read, so that
+	 * another task cannot extract entropy from the same seed. In other
+	 * words, when reseeding is requested, reseeding must be done every time
+	 * mlock is acquired.
+	 */
+	cri = raw_cpu_ptr(pcri);
+	rt_mutex_lock(&cri->mlock);
+	if (likely(cri->rng)) {
+		/*
+		 * Since this rng instance wasn't just allocated, it needs to be
+		 * explicitly reseeded. New rng instances are seeded on creation
+		 * in crypto_get_rng() and thus don't need explicit reseeding.
+		 */
+		crypto_tfm_set_flags(crypto_rng_tfm(cri->rng),
+				     CRYPTO_TFM_REQ_NEED_RESEED);
+	} else {
+		ret = crypto_get_rng(&cri->rng);
+		if (ret) {
+			rt_mutex_unlock(&cri->mlock);
+			return ERR_PTR(ret);
+		}
+	}
+
+	*rng = cri->rng;
+	return cri;
+}
+
+#define lock_local_rng(rng, reseed) \
+	({ (reseed) ? lock_reseed_rng(rng) : lock_default_rng(rng); })
+
+/* local_unlock() must get the __percpu lock, not a this_cpu_ptr() result */
+#define unlock_local_rng(cri, reseed) do {		\
+	if (reseed)					\
+		rt_mutex_unlock(&(cri)->mlock);		\
+	else						\
+		local_unlock(&pcpu_default_rng.lock);	\
+} while (0)
+
+static __always_inline void
+clear_rng_page(struct cpu_rng_inst *cri, size_t count)
+{
+	/* Zero @count bytes; a full page goes through the faster clear_page() */
+	count < PAGE_SIZE ? memset(cri->page, 0, count) : clear_page(cri->page);
+}
+
 static ssize_t crypto_devrandom_read_iter(struct iov_iter *iter, bool reseed)
 {
+	/* lock_local_rng() puts us in atomic context for !reseed on non-RT */
+	const bool atomic = !reseed && !IS_ENABLED(CONFIG_PREEMPT_RT);
+	const bool user_no_reseed = !reseed && user_backed_iter(iter);
+	size_t ulen, page_dirty_len = 0;
+	struct cpu_rng_inst *cri;
 	struct crypto_rng *rng;
-	u8 tmp[256];
-	ssize_t ret;
+	void __user *uaddr;
+	struct page *upage;
+	ssize_t ret = 0;
 
 	if (unlikely(!iov_iter_count(iter)))
 		return 0;
 
-	if (reseed) {
-		u32 flags = 0;
-
-		/* If reseeding is requested, acquire a lock on
-		 * crypto_reseed_rng so it is not swapped out until
-		 * the initial random bytes are generated.
-		 *
-		 * The algorithm implementation is also protected with
-		 * a separate mutex (drbg->drbg_mutex) around the
-		 * reseed-and-generate operation.
-		 */
-		rt_mutex_lock(&crypto_reseed_rng_lock);
-
-		/* If crypto_default_rng is not set, it will be seeded
-		 * at creation in __crypto_get_default_rng and thus no
-		 * reseeding is needed.
-		 */
-		if (crypto_reseed_rng)
-			flags |= CRYPTO_TFM_REQ_NEED_RESEED;
-
-		ret = crypto_get_rng(&crypto_reseed_rng);
-		if (ret) {
-			rt_mutex_unlock(&crypto_reseed_rng_lock);
-			return ret;
+	/* Set up the starting user destination address and length */
+	if (user_no_reseed) {
+		if (iter_is_ubuf(iter)) {
+			uaddr = iter->ubuf + iter->iov_offset;
+			ulen = iov_iter_count(iter);
+		} else if (iter_is_iovec(iter)) {
+			uaddr = iter_iov_addr(iter);
+			ulen = iter_iov_len(iter);
+		} else {
+			/*
+			 * ITER_UBUF and ITER_IOVEC are the only user-backed
+			 * iters. Bug out if a new user-backed iter appears.
+			 */
+			BUG();
 		}
+	}
 
-		rng = crypto_reseed_rng;
-		crypto_tfm_set_flags(crypto_rng_tfm(rng), flags);
-	} else {
-		ret = crypto_get_default_rng();
-		if (ret)
-			return ret;
-		rng = crypto_default_rng;
+	/* Prevent rngs from getting deleted from per-CPU RNG instances */
+	down_read_del_pcpu_rwsem();
+restart:
+	/*
+	 * Pin the user page backing the current user destination address,
+	 * potentially prefaulting to allocate a page for the destination. By
+	 * prefaulting without the RNG lock held, the DRBG won't be blocked by
+	 * time spent on page faults for this task, and thus the DRBG can still
+	 * be used by other tasks.
+	 */
+	if (user_no_reseed && pin_user_pages_fast((unsigned long)uaddr, 1,
+						  FOLL_WRITE, &upage) != 1)
+		goto up_rwsem;
+
+	cri = lock_local_rng(&rng, reseed);
+	if (IS_ERR(cri)) {
+		if (!ret)
+			ret = PTR_ERR(cri);
+		goto unpin_upage;
 	}
 
-	for (;;) {
-		size_t i, copied;
+	while (1) {
+		size_t copied, i = min(iov_iter_count(iter), PAGE_SIZE);
+		bool resched_without_lock = false;
 		int err;
 
-		i = min_t(size_t, iov_iter_count(iter), sizeof(tmp));
-		err = crypto_rng_get_bytes(rng, tmp, i);
+		/*
+		 * Generate up to one page at a time, and align to a page
+		 * boundary so we only need to pin one user page at a time.
+		 */
+		if (user_no_reseed)
+			i = min3(i, PAGE_SIZE - offset_in_page(uaddr), ulen);
+
+		/*
+		 * On non-PREEMPT_RT kernels, local locks disable preemption.
+		 * The DRBG's generate() function has a mutex lock, which could
+		 * mean that we'll schedule while atomic if the mutex lock
+		 * sleeps. However, that will never happen if we ensure that
+		 * there's never any contention on the DRBG's mutex lock while
+		 * we're atomic! Our local lock ensures calls to the DRBG are
+		 * always serialized, so there's no contention from here. And
+		 * the DRBG only uses its mutex lock from one other path, when
+		 * an instance of the DRBG is freshly allocated, which we only
+		 * do from crypto_get_rng(). So the DRBG's mutex lock is
+		 * guaranteed to not have contention when we call generate() and
+		 * thus it'll never sleep here. And of course, nothing else in
+		 * generate() ever sleeps.
+		 */
+		err = crypto_rng_get_bytes(rng, cri->page, i);
 		if (err) {
-			ret = ret ?: err;
+			if (!ret)
+				ret = err;
 			break;
 		}
 
-		copied = copy_to_iter(tmp, i, iter);
-		ret += copied;
+		/*
+		 * Record the number of bytes used in cri->page and either copy
+		 * directly to the user address without faulting, or copy to the
+		 * iter which is always backed by kernel memory when !reseed &&
+		 * !user_backed_iter(). When reseed == true, the iter may be
+		 * backed by user memory, but we copy to it with the possibility
+		 * of page faults anyway because we need to hold the lock across
+		 * the entire call; this is why a mutex is used instead of a
+		 * local lock for the reseed RNG, to permit sleeping without
+		 * yielding the DRBG instance.
+		 */
+		page_dirty_len = max(i, page_dirty_len);
+		if (user_no_reseed) {
+			err = copy_to_user_nofault(uaddr, cri->page, i);
+			if (err >= 0) {
+				iov_iter_advance(iter, i - err);
+				ret += i - err;
+			}
+			if (err)
+				break;
+		} else {
+			/*
+			 * We know that copying from cri->page is safe, so use
+			 * _copy_to_iter() directly to skip check_copy_size().
+			 */
+			copied = _copy_to_iter(cri->page, i, iter);
+			ret += copied;
+			if (copied != i)
+				break;
+		}
 
-		if (!iov_iter_count(iter) || copied != i)
+		/*
+		 * Quit when either the requested number of bytes have been
+		 * generated or there is a pending signal.
+		 */
+		if (!iov_iter_count(iter) || signal_pending(current))
 			break;
 
-		BUILD_BUG_ON(PAGE_SIZE % sizeof(tmp) != 0);
-		if (ret % PAGE_SIZE == 0) {
-			if (signal_pending(current))
-				break;
-			cond_resched();
+		/* Compute the next user destination address and length */
+		if (user_no_reseed) {
+			ulen -= i;
+			if (likely(ulen)) {
+				uaddr += i;
+			} else {
+				/*
+				 * This path is only reachable by ITER_IOVEC
+				 * because ulen is initialized to the request
+				 * size for ITER_UBUF, and therefore ITER_UBUF
+				 * will always quit at the iov_iter_count()
+				 * check above before ulen can become zero.
+				 *
+				 * iter->iov_offset is guaranteed to be zero
+				 * here, so iter_iov_{addr|len}() isn't needed.
+				 */
+				uaddr = iter_iov(iter)->iov_base;
+				ulen = iter_iov(iter)->iov_len;
+			}
+
+			unpin_user_page(upage);
+		}
+
+		/*
+		 * Reschedule right now if needed and we're not atomic. If we're
+		 * atomic, then we must first drop the lock to reschedule.
+		 */
+		if (need_resched()) {
+			if (atomic)
+				resched_without_lock = true;
+			else
+				cond_resched();
+		}
+
+		/*
+		 * Optimistically try to pin the next user page without
+		 * faulting, so we don't need to clear cri->page and drop the
+		 * lock on every iteration. If this fails, we fall back to
+		 * pinning with the option to prefault.
+		 */
+		if (user_no_reseed && !resched_without_lock &&
+		    pin_user_pages_fast_only((unsigned long)uaddr, 1,
+					     FOLL_WRITE, &upage) == 1)
+			continue;
+
+		/*
+		 * Restart if either rescheduling is needed (and requires
+		 * dropping the lock since we're atomic) or the optimistic page
+		 * pinning attempt failed.
+		 *
+		 * This always implies `reseed == false`, so unlock_local_rng()
+		 * can just be passed `false` for reseed to eliminate a branch.
+		 */
+		if (resched_without_lock || user_no_reseed) {
+			/*
+			 * Clear the buffer of our latest random bytes before
+			 * unlocking and potentially migrating CPUs, in which
+			 * case we wouldn't have the same `cri` anymore.
+			 */
+			clear_rng_page(cri, page_dirty_len);
+			unlock_local_rng(cri, false);
+			page_dirty_len = 0;
+			if (resched_without_lock)
+				cond_resched();
+			goto restart;
 		}
 	}
 
-	if (reseed)
-		rt_mutex_unlock(&crypto_reseed_rng_lock);
-	else
-		crypto_put_default_rng();
-	memzero_explicit(tmp, sizeof(tmp));
+	if (page_dirty_len)
+		clear_rng_page(cri, page_dirty_len);
+	unlock_local_rng(cri, reseed);
+unpin_upage:
+	if (user_no_reseed)
+		unpin_user_page(upage);
+up_rwsem:
+	up_read_del_pcpu_rwsem();
 	return ret ? ret : -EFAULT;
 }
 
@@ -326,16 +638,70 @@ static const struct random_extrng crypto_devrandom_rng = {
 	.owner = THIS_MODULE,
 };
 
+/* Free each possible CPU's rng, if one was allocated, and its buffer page */
+static void free_pcpu_inst(struct cpu_rng_inst __percpu *pcri)
+{
+	struct cpu_rng_inst *inst;
+	int i;
+
+	for_each_possible_cpu(i) {
+		inst = per_cpu_ptr(pcri, i);
+		if (inst->rng)
+			crypto_free_rng(inst->rng);
+		free_page((unsigned long)inst->page);
+	}
+}
+
+/* Allocate one buffer page per possible CPU; frees them all on failure */
+static int __init alloc_pcpu_inst(struct cpu_rng_inst __percpu *pcri)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		struct cpu_rng_inst *cri = per_cpu_ptr(pcri, cpu);
+
+		cri->page = (void *)__get_free_page(GFP_KERNEL);
+		if (!cri->page)
+			goto err_page_alloc;
+		local_lock_init(&cri->lock);
+	}
+	return 0;
+
+err_page_alloc:
+	/* CPU ids may be sparse; free_page() ignores the still-NULL pages */
+	for_each_possible_cpu(cpu)
+		free_page((unsigned long)per_cpu_ptr(pcri, cpu)->page);
+	return -ENOMEM;
+}
+
 static int __init crypto_rng_init(void)
 {
-	if (fips_enabled)
-		random_register_extrng(&crypto_devrandom_rng);
+	int ret;
+
+	if (!fips_enabled)
+		return 0;
+
+	ret = alloc_pcpu_inst(&pcpu_default_rng);
+	if (ret)
+		return ret;
+
+	ret = alloc_pcpu_inst(&pcpu_reseed_rng);
+	if (ret)
+		goto free_pcpu_default;
+
+	random_register_extrng(&crypto_devrandom_rng);
 	return 0;
+
+free_pcpu_default:
+	free_pcpu_inst(&pcpu_default_rng);
+	return ret;
 }
 
 static void __exit crypto_rng_exit(void)
 {
 	random_unregister_extrng();
+	free_pcpu_inst(&pcpu_default_rng);
+	free_pcpu_inst(&pcpu_reseed_rng);
 }
 
 late_initcall(crypto_rng_init);