From 4414307893dc2d068b61fd93a7fd07a5032a50e8 Mon Sep 17 00:00:00 2001 From: Jonathan Maple Date: Tue, 8 Jul 2025 11:24:21 -0400 Subject: [PATCH 01/10] ipv6: mcast: extend RCU protection in igmp6_send() jira LE-3526 cve CVE-2025-21759 Rebuild_History Non-Buildable kernel-6.12.0-55.20.1.el10_0 commit-author Eric Dumazet commit 087c1faa594fa07a66933d750c0b2610aa1a2946 igmp6_send() can be called without RTNL or RCU being held. Extend RCU protection so that we can safely fetch the net pointer and avoid a potential UAF. Note that we no longer can use sock_alloc_send_skb() because ipv6.igmp_sk uses GFP_KERNEL allocations which can sleep. Instead use alloc_skb() and charge the net->ipv6.igmp_sk socket under RCU protection. Fixes: b8ad0cbc58f7 ("[NETNS][IPV6] mcast - handle several network namespace") Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250207135841.1948589-9-edumazet@google.com Signed-off-by: Jakub Kicinski (cherry picked from commit 087c1faa594fa07a66933d750c0b2610aa1a2946) Signed-off-by: Jonathan Maple --- net/ipv6/mcast.c | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index b244dbf61d5f3..6551648512585 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -2122,21 +2122,21 @@ static void mld_send_cr(struct inet6_dev *idev) static void igmp6_send(struct in6_addr *addr, struct net_device *dev, int type) { - struct net *net = dev_net(dev); - struct sock *sk = net->ipv6.igmp_sk; + const struct in6_addr *snd_addr, *saddr; + int err, len, payload_len, full_len; + struct in6_addr addr_buf; struct inet6_dev *idev; struct sk_buff *skb; struct mld_msg *hdr; - const struct in6_addr *snd_addr, *saddr; - struct in6_addr addr_buf; int hlen = LL_RESERVED_SPACE(dev); int tlen = dev->needed_tailroom; - int err, len, payload_len, full_len; u8 ra[8] = { IPPROTO_ICMPV6, 0, IPV6_TLV_ROUTERALERT, 2, 0, 0, IPV6_TLV_PADN, 0 }; - struct flowi6 fl6; struct dst_entry *dst; + struct flowi6 fl6; + struct net *net; + struct sock *sk; if (type == ICMPV6_MGM_REDUCTION) snd_addr = &in6addr_linklocal_allrouters; @@ -2147,19 +2147,21 @@ static void igmp6_send(struct in6_addr *addr, struct net_device *dev, int type) payload_len = len + sizeof(ra); full_len = sizeof(struct ipv6hdr) + payload_len; - rcu_read_lock(); - IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_OUTREQUESTS); - rcu_read_unlock(); + skb = alloc_skb(hlen + tlen + full_len, GFP_KERNEL); - skb = sock_alloc_send_skb(sk, hlen + tlen + full_len, 1, &err); + rcu_read_lock(); + net = dev_net_rcu(dev); + idev = __in6_dev_get(dev); + IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTREQUESTS); if (!skb) { - rcu_read_lock(); - IP6_INC_STATS(net, __in6_dev_get(dev), - IPSTATS_MIB_OUTDISCARDS); + IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS); rcu_read_unlock(); return; } + sk = net->ipv6.igmp_sk; + skb_set_owner_w(skb, sk); + skb->priority = TC_PRIO_CONTROL; skb_reserve(skb, hlen); @@ -2184,9 +2186,6 @@ static void igmp6_send(struct in6_addr *addr, struct net_device *dev, int type) IPPROTO_ICMPV6, csum_partial(hdr, len, 0)); - rcu_read_lock(); - idev = __in6_dev_get(skb->dev); - icmpv6_flow_init(sk, &fl6, type, &ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr, skb->dev->ifindex); From 2f0a5409de8ff4b6c5ac168d50416776f41ec521 Mon Sep 17 00:00:00 2001 From: Jonathan Maple Date: Tue, 8 Jul 2025 11:24:21 -0400 Subject: [PATCH 02/10] dm: Allow .prepare_ioctl to handle ioctls directly jira LE-3526 Rebuild_History 
Non-Buildable kernel-6.12.0-55.20.1.el10_0 commit-author Kevin Wolf commit 4862c8861d902d43645a493e441c4478be1c6c44 This adds a 'bool *forward' parameter to .prepare_ioctl, which allows device mapper targets to accept ioctls to themselves instead of the underlying device. If the target already fully handled the ioctl, it sets *forward to false and device mapper won't forward it to the underlying device any more. In order for targets to actually know what the ioctl is about and how to handle it, pass also cmd and arg. As long as targets restrict themselves to interpreting ioctls of type DM_IOCTL, this is a backwards compatible change because previously, any such ioctl would have been passed down through all device mapper layers until it reached a device that can't understand the ioctl and would return an error. Signed-off-by: Kevin Wolf Reviewed-by: Benjamin Marzinski Signed-off-by: Mikulas Patocka (cherry picked from commit 4862c8861d902d43645a493e441c4478be1c6c44) Signed-off-by: Jonathan Maple --- drivers/md/dm-dust.c | 4 +++- drivers/md/dm-ebs-target.c | 3 ++- drivers/md/dm-flakey.c | 4 +++- drivers/md/dm-linear.c | 4 +++- drivers/md/dm-log-writes.c | 4 +++- drivers/md/dm-mpath.c | 4 +++- drivers/md/dm-switch.c | 4 +++- drivers/md/dm-verity-target.c | 4 +++- drivers/md/dm-zoned-target.c | 3 ++- drivers/md/dm.c | 17 +++++++++++------ include/linux/device-mapper.h | 9 ++++++++- 11 files changed, 44 insertions(+), 16 deletions(-) diff --git a/drivers/md/dm-dust.c b/drivers/md/dm-dust.c index 1a33820c9f46a..e75310232bbfc 100644 --- a/drivers/md/dm-dust.c +++ b/drivers/md/dm-dust.c @@ -534,7 +534,9 @@ static void dust_status(struct dm_target *ti, status_type_t type, } } -static int dust_prepare_ioctl(struct dm_target *ti, struct block_device **bdev) +static int dust_prepare_ioctl(struct dm_target *ti, struct block_device **bdev, + unsigned int cmd, unsigned long arg, + bool *forward) { struct dust_device *dd = ti->private; struct dm_dev *dev = dd->dev; diff --git a/drivers/md/dm-ebs-target.c b/drivers/md/dm-ebs-target.c index ec5db1478b2fc..83f9d72e74c0f 100644 --- a/drivers/md/dm-ebs-target.c +++ b/drivers/md/dm-ebs-target.c @@ -409,7 +409,8 @@ static void ebs_status(struct dm_target *ti, status_type_t type, } } -static int ebs_prepare_ioctl(struct dm_target *ti, struct block_device **bdev) +static int ebs_prepare_ioctl(struct dm_target *ti, struct block_device **bdev, + unsigned int cmd, unsigned long arg, bool *forward) { struct ebs_c *ec = ti->private; struct dm_dev *dev = ec->dev; diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c index b690905ab89ff..0fceb08f4622d 100644 --- a/drivers/md/dm-flakey.c +++ b/drivers/md/dm-flakey.c @@ -638,7 +638,9 @@ static void flakey_status(struct dm_target *ti, status_type_t type, } } -static int flakey_prepare_ioctl(struct dm_target *ti, struct block_device **bdev) +static int flakey_prepare_ioctl(struct dm_target *ti, struct block_device **bdev, + unsigned int cmd, unsigned long arg, + bool *forward) { struct flakey_c *fc = ti->private; diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index 49fb0f6841938..73f8386a284f5 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c @@ -119,7 +119,9 @@ static void linear_status(struct dm_target *ti, status_type_t type, } } -static int linear_prepare_ioctl(struct dm_target *ti, struct block_device **bdev) +static int linear_prepare_ioctl(struct dm_target *ti, struct block_device **bdev, + unsigned int cmd, unsigned long arg, + bool *forward) { struct linear_c *lc = ti->private; 
struct dm_dev *dev = lc->dev; diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c index 8d7df8303d0a1..d484e8e1d48a9 100644 --- a/drivers/md/dm-log-writes.c +++ b/drivers/md/dm-log-writes.c @@ -818,7 +818,9 @@ static void log_writes_status(struct dm_target *ti, status_type_t type, } static int log_writes_prepare_ioctl(struct dm_target *ti, - struct block_device **bdev) + struct block_device **bdev, + unsigned int cmd, unsigned long arg, + bool *forward) { struct log_writes_c *lc = ti->private; struct dm_dev *dev = lc->dev; diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 637977acc3dc0..368606afb6f05 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -2022,7 +2022,9 @@ static int multipath_message(struct dm_target *ti, unsigned int argc, char **arg } static int multipath_prepare_ioctl(struct dm_target *ti, - struct block_device **bdev) + struct block_device **bdev, + unsigned int cmd, unsigned long arg, + bool *forward) { struct multipath *m = ti->private; struct pgpath *pgpath; diff --git a/drivers/md/dm-switch.c b/drivers/md/dm-switch.c index dfd9fb52a6f33..bb1a70b5a2152 100644 --- a/drivers/md/dm-switch.c +++ b/drivers/md/dm-switch.c @@ -517,7 +517,9 @@ static void switch_status(struct dm_target *ti, status_type_t type, * * Passthrough all ioctls to the path for sector 0 */ -static int switch_prepare_ioctl(struct dm_target *ti, struct block_device **bdev) +static int switch_prepare_ioctl(struct dm_target *ti, struct block_device **bdev, + unsigned int cmd, unsigned long arg, + bool *forward) { struct switch_ctx *sctx = ti->private; unsigned int path_nr; diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index c142ec5458b70..4e11730f16f0a 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -946,7 +946,9 @@ static void verity_status(struct dm_target *ti, status_type_t type, } } -static int verity_prepare_ioctl(struct dm_target *ti, struct block_device **bdev) +static int verity_prepare_ioctl(struct dm_target *ti, struct block_device **bdev, + unsigned int cmd, unsigned long arg, + bool *forward) { struct dm_verity *v = ti->private; diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c index 6141fc25d8421..5da3db06da103 100644 --- a/drivers/md/dm-zoned-target.c +++ b/drivers/md/dm-zoned-target.c @@ -1015,7 +1015,8 @@ static void dmz_io_hints(struct dm_target *ti, struct queue_limits *limits) /* * Pass on ioctl to the backend device. 
*/ -static int dmz_prepare_ioctl(struct dm_target *ti, struct block_device **bdev) +static int dmz_prepare_ioctl(struct dm_target *ti, struct block_device **bdev, + unsigned int cmd, unsigned long arg, bool *forward) { struct dmz_target *dmz = ti->private; struct dmz_dev *dev = &dmz->dev[0]; diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 19230404d8c2b..dd43be28ce86b 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -411,7 +411,8 @@ static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo) } static int dm_prepare_ioctl(struct mapped_device *md, int *srcu_idx, - struct block_device **bdev) + struct block_device **bdev, unsigned int cmd, + unsigned long arg, bool *forward) { struct dm_target *ti; struct dm_table *map; @@ -434,8 +435,8 @@ static int dm_prepare_ioctl(struct mapped_device *md, int *srcu_idx, if (dm_suspended_md(md)) return -EAGAIN; - r = ti->type->prepare_ioctl(ti, bdev); - if (r == -ENOTCONN && !fatal_signal_pending(current)) { + r = ti->type->prepare_ioctl(ti, bdev, cmd, arg, forward); + if (r == -ENOTCONN && *forward && !fatal_signal_pending(current)) { dm_put_live_table(md, *srcu_idx); fsleep(10000); goto retry; @@ -454,9 +455,10 @@ static int dm_blk_ioctl(struct block_device *bdev, blk_mode_t mode, { struct mapped_device *md = bdev->bd_disk->private_data; int r, srcu_idx; + bool forward = true; - r = dm_prepare_ioctl(md, &srcu_idx, &bdev); - if (r < 0) + r = dm_prepare_ioctl(md, &srcu_idx, &bdev, cmd, arg, &forward); + if (!forward || r < 0) goto out; if (r > 0) { @@ -3576,10 +3578,13 @@ static int dm_pr_clear(struct block_device *bdev, u64 key) struct mapped_device *md = bdev->bd_disk->private_data; const struct pr_ops *ops; int r, srcu_idx; + bool forward = true; - r = dm_prepare_ioctl(md, &srcu_idx, &bdev); + /* Not a real ioctl, but targets must not interpret non-DM ioctls */ + r = dm_prepare_ioctl(md, &srcu_idx, &bdev, 0, 0, &forward); if (r < 0) goto out; + WARN_ON_ONCE(!forward); ops = bdev->bd_disk->fops->pr_ops; if (ops && ops->pr_clear) diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 8321f65897f3d..4a28f77a8841a 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -93,7 +93,14 @@ typedef void (*dm_status_fn) (struct dm_target *ti, status_type_t status_type, typedef int (*dm_message_fn) (struct dm_target *ti, unsigned int argc, char **argv, char *result, unsigned int maxlen); -typedef int (*dm_prepare_ioctl_fn) (struct dm_target *ti, struct block_device **bdev); +/* + * Called with *forward == true. If it remains true, the ioctl should be + * forwarded to bdev. If it is reset to false, the target already fully handled + * the ioctl and the return value is the return value for the whole ioctl. + */ +typedef int (*dm_prepare_ioctl_fn) (struct dm_target *ti, struct block_device **bdev, + unsigned int cmd, unsigned long arg, + bool *forward); #ifdef CONFIG_BLK_DEV_ZONED typedef int (*dm_report_zones_fn) (struct dm_target *ti, From 97ddc3729639a450b6468d1967aefbe3a9ad11c4 Mon Sep 17 00:00:00 2001 From: Jonathan Maple Date: Tue, 8 Jul 2025 11:24:22 -0400 Subject: [PATCH 03/10] dm mpath: Interface for explicit probing of active paths jira LE-3526 Rebuild_History Non-Buildable kernel-6.12.0-55.20.1.el10_0 commit-author Kevin Wolf commit 7734fb4ad98c3fdaf0fde82978ef8638195a5285 Empty-Commit: Cherry-Pick Conflicts during history rebuild. Will be included in final tarball splat. 
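As an aside on the contract just documented in device-mapper.h: a minimal
target-side sketch of the new hook, assuming a hypothetical target whose
private data (struct example_c) holds a single backing dm_dev — this is an
illustration of the calling convention, not code from the patch:

#include <linux/device-mapper.h>
#include <linux/dm-ioctl.h>
#include <linux/errno.h>

struct example_c {
	struct dm_dev *dev;	/* hypothetical backing device */
};

static int example_prepare_ioctl(struct dm_target *ti, struct block_device **bdev,
				 unsigned int cmd, unsigned long arg,
				 bool *forward)
{
	struct example_c *ec = ti->private;

	if (_IOC_TYPE(cmd) == DM_IOCTL) {
		/* DM-type ioctls are ours: tell the core not to forward. */
		*forward = false;
		return -ENOTTY;	/* this sketch defines no ioctls of its own */
	}

	/* *forward stays true: pass everything else to the backing device. */
	*bdev = ec->dev->bdev;
	return 0;
}

This mirrors the pattern the multipath target adopts in the next patch:
claim DM-type ioctls, forward the rest unchanged.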
Ref for failed cherry-pick at:
ciq/ciq_backports/kernel-6.12.0-55.20.1.el10_0/7734fb4a.failed

Multipath cannot directly provide failover for ioctls in the kernel
because it doesn't know what each ioctl means and which result could
indicate a path error. Userspace generally knows what the ioctl it
issued means and if it might be a path error, but neither does it know
which path the ioctl took nor does it necessarily have the privileges
to fail a path using the control device.

In order to allow userspace to address this situation, implement a
DM_MPATH_PROBE_PATHS ioctl that prompts the dm-mpath driver to probe all
active paths in the current path group to see whether they still work,
and fail them if not. If this returns success, userspace can retry the
ioctl and expect that the previously hit bad path is now failed (or
working again).

The immediate motivation for this is the use of SG_IO in QEMU for SCSI
passthrough. Following a failed SG_IO ioctl, QEMU will trigger probing
to ensure that all active paths are actually alive, so that retrying
SG_IO at least has a lower chance of failing due to a path error.
However, the problem is broader than just SG_IO (it affects any ioctl),
and if applications need failover support for other ioctls, the same
probing can be used.

This is not implemented on the DM control device, but on the DM mpath
block devices, to allow all users who have access to such a block device
to make use of this interface, specifically to implement failover for
ioctls. For the same reason, it is also unprivileged. Its implementation
is effectively just a bunch of reads, which could already be issued by
userspace, just without any guarantee that all the right paths are
selected.

The probing implemented here is done fully synchronously path by path;
probing all paths concurrently is left as an improvement for the future.

Co-developed-by: Hanna Czenczek
Signed-off-by: Hanna Czenczek
Signed-off-by: Kevin Wolf
Reviewed-by: Benjamin Marzinski
Signed-off-by: Benjamin Marzinski
Signed-off-by: Mikulas Patocka
(cherry picked from commit 7734fb4ad98c3fdaf0fde82978ef8638195a5285)
Signed-off-by: Jonathan Maple

# Conflicts:
#	include/uapi/linux/dm-ioctl.h
---
 .../7734fb4a.failed                           | 216 ++++++++++++++++++
 1 file changed, 216 insertions(+)
 create mode 100644 ciq/ciq_backports/kernel-6.12.0-55.20.1.el10_0/7734fb4a.failed

diff --git a/ciq/ciq_backports/kernel-6.12.0-55.20.1.el10_0/7734fb4a.failed b/ciq/ciq_backports/kernel-6.12.0-55.20.1.el10_0/7734fb4a.failed
new file mode 100644
index 0000000000000..31e0147f73f3b
--- /dev/null
+++ b/ciq/ciq_backports/kernel-6.12.0-55.20.1.el10_0/7734fb4a.failed
@@ -0,0 +1,216 @@
+dm mpath: Interface for explicit probing of active paths
+
+jira LE-3526
+Rebuild_History Non-Buildable kernel-6.12.0-55.20.1.el10_0
+commit-author Kevin Wolf
+commit 7734fb4ad98c3fdaf0fde82978ef8638195a5285
+Empty-Commit: Cherry-Pick Conflicts during history rebuild.
+Will be included in final tarball splat. Ref for failed cherry-pick at:
+ciq/ciq_backports/kernel-6.12.0-55.20.1.el10_0/7734fb4a.failed
+
+Multipath cannot directly provide failover for ioctls in the kernel
+because it doesn't know what each ioctl means and which result could
+indicate a path error. Userspace generally knows what the ioctl it
+issued means and if it might be a path error, but neither does it know
+which path the ioctl took nor does it necessarily have the privileges to
+fail a path using the control device.
+ +In order to allow userspace to address this situation, implement a +DM_MPATH_PROBE_PATHS ioctl that prompts the dm-mpath driver to probe all +active paths in the current path group to see whether they still work, +and fail them if not. If this returns success, userspace can retry the +ioctl and expect that the previously hit bad path is now failed (or +working again). + +The immediate motivation for this is the use of SG_IO in QEMU for SCSI +passthrough. Following a failed SG_IO ioctl, QEMU will trigger probing +to ensure that all active paths are actually alive, so that retrying +SG_IO at least has a lower chance of failing due to a path error. +However, the problem is broader than just SG_IO (it affects any ioctl), +and if applications need failover support for other ioctls, the same +probing can be used. + +This is not implemented on the DM control device, but on the DM mpath +block devices, to allow all users who have access to such a block device +to make use of this interface, specifically to implement failover for +ioctls. For the same reason, it is also unprivileged. Its implementation +is effectively just a bunch of reads, which could already be issued by +userspace, just without any guarantee that all the rights paths are +selected. + +The probing implemented here is done fully synchronously path by path; +probing all paths concurrently is left as an improvement for the future. + +Co-developed-by: Hanna Czenczek + Signed-off-by: Hanna Czenczek + Signed-off-by: Kevin Wolf + Reviewed-by: Benjamin Marzinski + Signed-off-by: Benjamin Marzinski + Signed-off-by: Mikulas Patocka +(cherry picked from commit 7734fb4ad98c3fdaf0fde82978ef8638195a5285) + Signed-off-by: Jonathan Maple + +# Conflicts: +# include/uapi/linux/dm-ioctl.h +diff --cc include/uapi/linux/dm-ioctl.h +index 1990b5700f69,3225e025e30e..000000000000 +--- a/include/uapi/linux/dm-ioctl.h ++++ b/include/uapi/linux/dm-ioctl.h +@@@ -285,10 -287,13 +287,19 @@@ enum + #define DM_TARGET_MSG _IOWR(DM_IOCTL, DM_TARGET_MSG_CMD, struct dm_ioctl) + #define DM_DEV_SET_GEOMETRY _IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl) + ++ /* Block device ioctls */ ++ #define DM_MPATH_PROBE_PATHS _IO(DM_IOCTL, DM_MPATH_PROBE_PATHS_CMD) ++ + #define DM_VERSION_MAJOR 4 +++<<<<<<< HEAD + +#define DM_VERSION_MINOR 48 + +#define DM_VERSION_PATCHLEVEL 0 + +#define DM_VERSION_EXTRA "-ioctl (2023-03-01)" +++======= ++ #define DM_VERSION_MINOR 50 ++ #define DM_VERSION_PATCHLEVEL 0 ++ #define DM_VERSION_EXTRA "-ioctl (2025-04-28)" +++>>>>>>> 7734fb4ad98c (dm mpath: Interface for explicit probing of active paths) + + /* Status bits */ + #define DM_READONLY_FLAG (1 << 0) /* In/Out */ +diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c +index f299ff393a6a..d51088980bea 100644 +--- a/drivers/md/dm-ioctl.c ++++ b/drivers/md/dm-ioctl.c +@@ -1885,6 +1885,7 @@ static ioctl_fn lookup_ioctl(unsigned int cmd, int *ioctl_flags) + {DM_DEV_SET_GEOMETRY_CMD, 0, dev_set_geometry}, + {DM_DEV_ARM_POLL_CMD, IOCTL_FLAGS_NO_PARAMS, dev_arm_poll}, + {DM_GET_TARGET_VERSION_CMD, 0, get_target_version}, ++ {DM_MPATH_PROBE_PATHS_CMD, 0, NULL}, /* block device ioctl */ + }; + + if (unlikely(cmd >= ARRAY_SIZE(_ioctls))) +diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c +index 368606afb6f0..4dca7382e6bd 100644 +--- a/drivers/md/dm-mpath.c ++++ b/drivers/md/dm-mpath.c +@@ -2021,6 +2021,94 @@ static int multipath_message(struct dm_target *ti, unsigned int argc, char **arg + return r; + } + ++/* ++ * Perform a minimal read from the given path to find out 
whether the ++ * path still works. If a path error occurs, fail it. ++ */ ++static int probe_path(struct pgpath *pgpath) ++{ ++ struct block_device *bdev = pgpath->path.dev->bdev; ++ unsigned int read_size = bdev_logical_block_size(bdev); ++ struct page *page; ++ struct bio *bio; ++ blk_status_t status; ++ int r = 0; ++ ++ if (WARN_ON_ONCE(read_size > PAGE_SIZE)) ++ return -EINVAL; ++ ++ page = alloc_page(GFP_KERNEL); ++ if (!page) ++ return -ENOMEM; ++ ++ /* Perform a minimal read: Sector 0, length read_size */ ++ bio = bio_alloc(bdev, 1, REQ_OP_READ, GFP_KERNEL); ++ if (!bio) { ++ r = -ENOMEM; ++ goto out; ++ } ++ ++ bio->bi_iter.bi_sector = 0; ++ __bio_add_page(bio, page, read_size, 0); ++ submit_bio_wait(bio); ++ status = bio->bi_status; ++ bio_put(bio); ++ ++ if (status && blk_path_error(status)) ++ fail_path(pgpath); ++ ++out: ++ __free_page(page); ++ return r; ++} ++ ++/* ++ * Probe all active paths in current_pg to find out whether they still work. ++ * Fail all paths that do not work. ++ * ++ * Return -ENOTCONN if no valid path is left (even outside of current_pg). We ++ * cannot probe paths in other pgs without switching current_pg, so if valid ++ * paths are only in different pgs, they may or may not work. Additionally ++ * we should not probe paths in a pathgroup that is in the process of ++ * Initializing. Userspace can submit a request and we'll switch and wait ++ * for the pathgroup to be initialized. If the request fails, it may need to ++ * probe again. ++ */ ++static int probe_active_paths(struct multipath *m) ++{ ++ struct pgpath *pgpath; ++ struct priority_group *pg; ++ unsigned long flags; ++ int r = 0; ++ ++ mutex_lock(&m->work_mutex); ++ ++ spin_lock_irqsave(&m->lock, flags); ++ if (test_bit(MPATHF_QUEUE_IO, &m->flags)) ++ pg = NULL; ++ else ++ pg = m->current_pg; ++ spin_unlock_irqrestore(&m->lock, flags); ++ ++ if (pg) { ++ list_for_each_entry(pgpath, &pg->pgpaths, list) { ++ if (!pgpath->is_active) ++ continue; ++ ++ r = probe_path(pgpath); ++ if (r < 0) ++ goto out; ++ } ++ } ++ ++ if (!atomic_read(&m->nr_valid_paths)) ++ r = -ENOTCONN; ++ ++out: ++ mutex_unlock(&m->work_mutex); ++ return r; ++} ++ + static int multipath_prepare_ioctl(struct dm_target *ti, + struct block_device **bdev, + unsigned int cmd, unsigned long arg, +@@ -2031,6 +2119,16 @@ static int multipath_prepare_ioctl(struct dm_target *ti, + unsigned long flags; + int r; + ++ if (_IOC_TYPE(cmd) == DM_IOCTL) { ++ *forward = false; ++ switch (cmd) { ++ case DM_MPATH_PROBE_PATHS: ++ return probe_active_paths(m); ++ default: ++ return -ENOTTY; ++ } ++ } ++ + pgpath = READ_ONCE(m->current_pgpath); + if (!pgpath || !mpath_double_check_test_bit(MPATHF_QUEUE_IO, m)) + pgpath = choose_pgpath(m, 0); +@@ -2182,7 +2280,7 @@ static int multipath_busy(struct dm_target *ti) + */ + static struct target_type multipath_target = { + .name = "multipath", +- .version = {1, 14, 0}, ++ .version = {1, 15, 0}, + .features = DM_TARGET_SINGLETON | DM_TARGET_IMMUTABLE | + DM_TARGET_PASSES_INTEGRITY, + .module = THIS_MODULE, +* Unmerged path include/uapi/linux/dm-ioctl.h From b97b5b69723503040ab6fb9ada3f7698b6e09417 Mon Sep 17 00:00:00 2001 From: Jonathan Maple Date: Tue, 8 Jul 2025 11:24:22 -0400 Subject: [PATCH 04/10] dm-mpath: Don't grab work_mutex while probing paths jira LE-3526 Rebuild_History Non-Buildable kernel-6.12.0-55.20.1.el10_0 commit-author Benjamin Marzinski commit 5c977f1023156938915c57d362fddde8fad2b052 Empty-Commit: Cherry-Pick Conflicts during history rebuild. Will be included in final tarball splat. 
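Since both this conflict record and the previous one carry the probe
interface, a sketch of the intended userspace flow, assuming a kernel with
this series applied so that <linux/dm-ioctl.h> defines DM_MPATH_PROBE_PATHS
(the device path and the retry policy are illustrative assumptions):

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/dm-ioctl.h>	/* DM_MPATH_PROBE_PATHS on patched kernels */

int main(void)
{
	int fd = open("/dev/mapper/mpatha", O_RDONLY);	/* hypothetical mpath dev */

	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* After an ioctl such as SG_IO fails with a suspected path error,
	 * ask dm-mpath to probe the active path group and fail dead paths.
	 * This is an unprivileged, argument-less block-device ioctl. */
	if (ioctl(fd, DM_MPATH_PROBE_PATHS) < 0) {
		/* ENOTCONN means no valid path is left: do not retry. */
		fprintf(stderr, "probe: %s\n", strerror(errno));
		close(fd);
		return 1;
	}

	/* Success: retrying the original ioctl is now less likely to hit
	 * the same bad path. */
	puts("paths probed, retry the original ioctl");
	close(fd);
	return 0;
}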
Ref for failed cherry-pick at:
ciq/ciq_backports/kernel-6.12.0-55.20.1.el10_0/5c977f10.failed

Grabbing the work_mutex keeps probe_active_paths() from running at the
same time as multipath_message(). The only messages that could interfere
with probing the paths are "disable_group", "enable_group", and
"switch_group". These messages could force multipath to pick a new
pathgroup while probe_active_paths() was probing the current pathgroup.
If the multipath device has a hardware handler, and it switches active
pathgroups while there is outstanding IO to a path device, it's possible
that IO to the path will fail, even if the path would be usable if it
was in the active pathgroup. To avoid this, do not clear the current
pathgroup for the *_group messages while probe_active_paths() is
running. Instead set a flag, and probe_active_paths() will clear the
current pathgroup when it finishes probing the paths. For this to work
correctly, multipath needs to check current_pg before next_pg in
choose_pgpath(), but before this patch next_pg was only ever set when
current_pg was cleared, so this doesn't change the current behavior when
paths aren't being probed. Even with this change, it is still possible
to switch pathgroups while the probe is running, but only if all the
paths have failed, and the probe function will skip them as well in this
case.

If multiple DM_MPATH_PROBE_PATHS requests are received at once, there is
no point in repeatedly issuing test IOs. Instead, the later probes
should wait for the current probe to complete. If the current pathgroup
is still the same as the one that was just checked, the other probes
should skip probing and just check the number of valid paths. Finally,
probing the paths should quit early if the multipath device is trying to
suspend, instead of continuing to issue test IOs, delaying the suspend.

While this patch will not change the behavior of existing multipath
users which don't use the DM_MPATH_PROBE_PATHS ioctl, when that ioctl is
used, the behavior of the "disable_group", "enable_group", and
"switch_group" messages can change subtly. When these messages return,
the next IO to the multipath device will no longer be guaranteed to
choose a new pathgroup. Instead, choosing a new pathgroup could be
delayed by an in-progress DM_MPATH_PROBE_PATHS ioctl. The userspace
multipath tools make no assumptions about what will happen to IOs after
sending these messages, so this change will not affect already released
versions of them, even if the DM_MPATH_PROBE_PATHS ioctl is run
alongside them.

Signed-off-by: Benjamin Marzinski
Signed-off-by: Mikulas Patocka
(cherry picked from commit 5c977f1023156938915c57d362fddde8fad2b052)
Signed-off-by: Jonathan Maple

# Conflicts:
#	drivers/md/dm-mpath.c
---
 .../5c977f10.failed                           | 180 ++++++++++++++++++
 1 file changed, 180 insertions(+)
 create mode 100644 ciq/ciq_backports/kernel-6.12.0-55.20.1.el10_0/5c977f10.failed

diff --git a/ciq/ciq_backports/kernel-6.12.0-55.20.1.el10_0/5c977f10.failed b/ciq/ciq_backports/kernel-6.12.0-55.20.1.el10_0/5c977f10.failed
new file mode 100644
index 0000000000000..99d4a35f49539
--- /dev/null
+++ b/ciq/ciq_backports/kernel-6.12.0-55.20.1.el10_0/5c977f10.failed
@@ -0,0 +1,180 @@
+dm-mpath: Don't grab work_mutex while probing paths
+
+jira LE-3526
+Rebuild_History Non-Buildable kernel-6.12.0-55.20.1.el10_0
+commit-author Benjamin Marzinski
+commit 5c977f1023156938915c57d362fddde8fad2b052
+Empty-Commit: Cherry-Pick Conflicts during history rebuild.
+Will be included in final tarball splat.
Ref for failed cherry-pick at: +ciq/ciq_backports/kernel-6.12.0-55.20.1.el10_0/5c977f10.failed + +Grabbing the work_mutex keeps probe_active_paths() from running at the +same time as multipath_message(). The only messages that could interfere +with probing the paths are "disable_group", "enable_group", and +"switch_group". These messages could force multipath to pick a new +pathgroup while probe_active_paths() was probing the current pathgroup. +If the multipath device has a hardware handler, and it switches active +pathgroups while there is outstanding IO to a path device, it's possible +that IO to the path will fail, even if the path would be usable if it +was in the active pathgroup. To avoid this, do not clear the current +pathgroup for the *_group messages while probe_active_paths() is +running. Instead set a flag, and probe_active_paths() will clear the +current pathgroup when it finishes probing the paths. For this to work +correctly, multipath needs to check current_pg before next_pg in +choose_pgpath(), but before this patch next_pg was only ever set when +current_pg was cleared, so this doesn't change the current behavior when +paths aren't being probed. Even with this change, it is still possible +to switch pathgroups while the probe is running, but only if all the +paths have failed, and the probe function will skip them as well in this +case. + +If multiple DM_MPATH_PROBE_PATHS requests are received at once, there is +no point in repeatedly issuing test IOs. Instead, the later probes +should wait for the current probe to complete. If current pathgroup is +still the same as the one that was just checked, the other probes should +skip probing and just check the number of valid paths. Finally, probing +the paths should quit early if the multipath device is trying to +suspend, instead of continuing to issue test IOs, delaying the suspend. + +While this patch will not change the behavior of existing multipath +users which don't use the DM_MPATH_PROBE_PATHS ioctl, when that ioctl +is used, the behavior of the "disable_group", "enable_group", and +"switch_group" messages can change subtly. When these messages return, +the next IO to the multipath device will no longer be guaranteed to +choose a new pathgroup. Instead, choosing a new pathgroup could be +delayed by an in-progress DM_MPATH_PROBE_PATHS ioctl. The userspace +multipath tools make no assumptions about what will happen to IOs after +sending these messages, so this change will not effect already released +versions of them, even if the DM_MPATH_PROBE_PATHS ioctl is run +alongside them. + + Signed-off-by: Benjamin Marzinski + Signed-off-by: Mikulas Patocka +(cherry picked from commit 5c977f1023156938915c57d362fddde8fad2b052) + Signed-off-by: Jonathan Maple + +# Conflicts: +# drivers/md/dm-mpath.c +diff --cc drivers/md/dm-mpath.c +index 368606afb6f0,12b7bcae490c..000000000000 +--- a/drivers/md/dm-mpath.c ++++ b/drivers/md/dm-mpath.c +@@@ -2021,6 -2039,114 +2039,117 @@@ out + return r; + } + +++<<<<<<< HEAD +++======= ++ /* ++ * Perform a minimal read from the given path to find out whether the ++ * path still works. If a path error occurs, fail it. 
++ */ ++ static int probe_path(struct pgpath *pgpath) ++ { ++ struct block_device *bdev = pgpath->path.dev->bdev; ++ unsigned int read_size = bdev_logical_block_size(bdev); ++ struct page *page; ++ struct bio *bio; ++ blk_status_t status; ++ int r = 0; ++ ++ if (WARN_ON_ONCE(read_size > PAGE_SIZE)) ++ return -EINVAL; ++ ++ page = alloc_page(GFP_KERNEL); ++ if (!page) ++ return -ENOMEM; ++ ++ /* Perform a minimal read: Sector 0, length read_size */ ++ bio = bio_alloc(bdev, 1, REQ_OP_READ, GFP_KERNEL); ++ if (!bio) { ++ r = -ENOMEM; ++ goto out; ++ } ++ ++ bio->bi_iter.bi_sector = 0; ++ __bio_add_page(bio, page, read_size, 0); ++ submit_bio_wait(bio); ++ status = bio->bi_status; ++ bio_put(bio); ++ ++ if (status && blk_path_error(status)) ++ fail_path(pgpath); ++ ++ out: ++ __free_page(page); ++ return r; ++ } ++ ++ /* ++ * Probe all active paths in current_pg to find out whether they still work. ++ * Fail all paths that do not work. ++ * ++ * Return -ENOTCONN if no valid path is left (even outside of current_pg). We ++ * cannot probe paths in other pgs without switching current_pg, so if valid ++ * paths are only in different pgs, they may or may not work. Additionally ++ * we should not probe paths in a pathgroup that is in the process of ++ * Initializing. Userspace can submit a request and we'll switch and wait ++ * for the pathgroup to be initialized. If the request fails, it may need to ++ * probe again. ++ */ ++ static int probe_active_paths(struct multipath *m) ++ { ++ struct pgpath *pgpath; ++ struct priority_group *pg = NULL; ++ unsigned long flags; ++ int r = 0; ++ ++ spin_lock_irqsave(&m->lock, flags); ++ if (test_bit(MPATHF_DELAY_PG_SWITCH, &m->flags)) { ++ wait_event_lock_irq(m->probe_wait, ++ !test_bit(MPATHF_DELAY_PG_SWITCH, &m->flags), ++ m->lock); ++ /* ++ * if we waited because a probe was already in progress, ++ * and it probed the current active pathgroup, don't ++ * reprobe. 
Just return the number of valid paths ++ */ ++ if (m->current_pg == m->last_probed_pg) ++ goto skip_probe; ++ } ++ if (!m->current_pg || m->is_suspending || ++ test_bit(MPATHF_QUEUE_IO, &m->flags)) ++ goto skip_probe; ++ set_bit(MPATHF_DELAY_PG_SWITCH, &m->flags); ++ pg = m->last_probed_pg = m->current_pg; ++ spin_unlock_irqrestore(&m->lock, flags); ++ ++ list_for_each_entry(pgpath, &pg->pgpaths, list) { ++ if (pg != READ_ONCE(m->current_pg) || ++ READ_ONCE(m->is_suspending)) ++ goto out; ++ if (!pgpath->is_active) ++ continue; ++ ++ r = probe_path(pgpath); ++ if (r < 0) ++ goto out; ++ } ++ ++ out: ++ spin_lock_irqsave(&m->lock, flags); ++ clear_bit(MPATHF_DELAY_PG_SWITCH, &m->flags); ++ if (test_and_clear_bit(MPATHF_NEED_PG_SWITCH, &m->flags)) { ++ m->current_pgpath = NULL; ++ m->current_pg = NULL; ++ } ++ skip_probe: ++ if (r == 0 && !atomic_read(&m->nr_valid_paths)) ++ r = -ENOTCONN; ++ spin_unlock_irqrestore(&m->lock, flags); ++ if (pg) ++ wake_up(&m->probe_wait); ++ return r; ++ } ++ +++>>>>>>> 5c977f102315 (dm-mpath: Don't grab work_mutex while probing paths) + static int multipath_prepare_ioctl(struct dm_target *ti, + struct block_device **bdev, + unsigned int cmd, unsigned long arg, +* Unmerged path drivers/md/dm-mpath.c From 7c6318692ed0ddff3b841ba0407452f423dedfc4 Mon Sep 17 00:00:00 2001 From: Jonathan Maple Date: Tue, 8 Jul 2025 11:24:23 -0400 Subject: [PATCH 05/10] dm mpath: replace spin_lock_irqsave with spin_lock_irq jira LE-3526 Rebuild_History Non-Buildable kernel-6.12.0-55.20.1.el10_0 commit-author Mikulas Patocka commit 050a3e71ce24c6f18d70679d68056f76375ff51c Empty-Commit: Cherry-Pick Conflicts during history rebuild. Will be included in final tarball splat. Ref for failed cherry-pick at: ciq/ciq_backports/kernel-6.12.0-55.20.1.el10_0/050a3e71.failed Replace spin_lock_irqsave/spin_unlock_irqrestore with spin_lock_irq/spin_unlock_irq at places where it is known that interrupts are enabled. Signed-off-by: Mikulas Patocka Signed-off-by: Benjamin Marzinski (cherry picked from commit 050a3e71ce24c6f18d70679d68056f76375ff51c) Signed-off-by: Jonathan Maple # Conflicts: # drivers/md/dm-mpath.c --- .../050a3e71.failed | 210 ++++++++++++++++++ 1 file changed, 210 insertions(+) create mode 100644 ciq/ciq_backports/kernel-6.12.0-55.20.1.el10_0/050a3e71.failed diff --git a/ciq/ciq_backports/kernel-6.12.0-55.20.1.el10_0/050a3e71.failed b/ciq/ciq_backports/kernel-6.12.0-55.20.1.el10_0/050a3e71.failed new file mode 100644 index 0000000000000..56e6fe687d197 --- /dev/null +++ b/ciq/ciq_backports/kernel-6.12.0-55.20.1.el10_0/050a3e71.failed @@ -0,0 +1,210 @@ +dm mpath: replace spin_lock_irqsave with spin_lock_irq + +jira LE-3526 +Rebuild_History Non-Buildable kernel-6.12.0-55.20.1.el10_0 +commit-author Mikulas Patocka +commit 050a3e71ce24c6f18d70679d68056f76375ff51c +Empty-Commit: Cherry-Pick Conflicts during history rebuild. +Will be included in final tarball splat. Ref for failed cherry-pick at: +ciq/ciq_backports/kernel-6.12.0-55.20.1.el10_0/050a3e71.failed + +Replace spin_lock_irqsave/spin_unlock_irqrestore with +spin_lock_irq/spin_unlock_irq at places where it is known that interrupts +are enabled. 
+ + Signed-off-by: Mikulas Patocka + Signed-off-by: Benjamin Marzinski +(cherry picked from commit 050a3e71ce24c6f18d70679d68056f76375ff51c) + Signed-off-by: Jonathan Maple + +# Conflicts: +# drivers/md/dm-mpath.c +diff --cc drivers/md/dm-mpath.c +index 368606afb6f0,81fec2e1e0ef..000000000000 +--- a/drivers/md/dm-mpath.c ++++ b/drivers/md/dm-mpath.c +@@@ -1476,11 -1478,15 +1469,11 @@@ static int switch_pg_num(struct multipa + if (--pgnum) + continue; + + - if (test_bit(MPATHF_DELAY_PG_SWITCH, &m->flags)) + - set_bit(MPATHF_NEED_PG_SWITCH, &m->flags); + - else { + - m->current_pgpath = NULL; + - m->current_pg = NULL; + - } + + m->current_pgpath = NULL; + + m->current_pg = NULL; + m->next_pg = pg; + } +- spin_unlock_irqrestore(&m->lock, flags); ++ spin_unlock_irq(&m->lock); + + schedule_work(&m->trigger_event); + return 0; +@@@ -1742,6 -1748,9 +1735,12 @@@ static void multipath_presuspend(struc + { + struct multipath *m = ti->private; + +++<<<<<<< HEAD +++======= ++ spin_lock_irq(&m->lock); ++ m->is_suspending = true; ++ spin_unlock_irq(&m->lock); +++>>>>>>> 050a3e71ce24 (dm mpath: replace spin_lock_irqsave with spin_lock_irq) + /* FIXME: bio-based shouldn't need to always disable queue_if_no_path */ + if (m->queue_mode == DM_TYPE_BIO_BASED || !dm_noflush_suspending(m->ti)) + queue_if_no_path(m, false, true, __func__); +@@@ -1762,9 -1771,9 +1761,13 @@@ static void multipath_postsuspend(struc + static void multipath_resume(struct dm_target *ti) + { + struct multipath *m = ti->private; +- unsigned long flags; + +++<<<<<<< HEAD + + spin_lock_irqsave(&m->lock, flags); +++======= ++ spin_lock_irq(&m->lock); ++ m->is_suspending = false; +++>>>>>>> 050a3e71ce24 (dm mpath: replace spin_lock_irqsave with spin_lock_irq) + if (test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags)) { + set_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags); + clear_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags); +@@@ -2021,6 -2028,113 +2022,116 @@@ out + return r; + } + +++<<<<<<< HEAD +++======= ++ /* ++ * Perform a minimal read from the given path to find out whether the ++ * path still works. If a path error occurs, fail it. ++ */ ++ static int probe_path(struct pgpath *pgpath) ++ { ++ struct block_device *bdev = pgpath->path.dev->bdev; ++ unsigned int read_size = bdev_logical_block_size(bdev); ++ struct page *page; ++ struct bio *bio; ++ blk_status_t status; ++ int r = 0; ++ ++ if (WARN_ON_ONCE(read_size > PAGE_SIZE)) ++ return -EINVAL; ++ ++ page = alloc_page(GFP_KERNEL); ++ if (!page) ++ return -ENOMEM; ++ ++ /* Perform a minimal read: Sector 0, length read_size */ ++ bio = bio_alloc(bdev, 1, REQ_OP_READ, GFP_KERNEL); ++ if (!bio) { ++ r = -ENOMEM; ++ goto out; ++ } ++ ++ bio->bi_iter.bi_sector = 0; ++ __bio_add_page(bio, page, read_size, 0); ++ submit_bio_wait(bio); ++ status = bio->bi_status; ++ bio_put(bio); ++ ++ if (status && blk_path_error(status)) ++ fail_path(pgpath); ++ ++ out: ++ __free_page(page); ++ return r; ++ } ++ ++ /* ++ * Probe all active paths in current_pg to find out whether they still work. ++ * Fail all paths that do not work. ++ * ++ * Return -ENOTCONN if no valid path is left (even outside of current_pg). We ++ * cannot probe paths in other pgs without switching current_pg, so if valid ++ * paths are only in different pgs, they may or may not work. Additionally ++ * we should not probe paths in a pathgroup that is in the process of ++ * Initializing. Userspace can submit a request and we'll switch and wait ++ * for the pathgroup to be initialized. If the request fails, it may need to ++ * probe again. 
++ */ ++ static int probe_active_paths(struct multipath *m) ++ { ++ struct pgpath *pgpath; ++ struct priority_group *pg = NULL; ++ int r = 0; ++ ++ spin_lock_irq(&m->lock); ++ if (test_bit(MPATHF_DELAY_PG_SWITCH, &m->flags)) { ++ wait_event_lock_irq(m->probe_wait, ++ !test_bit(MPATHF_DELAY_PG_SWITCH, &m->flags), ++ m->lock); ++ /* ++ * if we waited because a probe was already in progress, ++ * and it probed the current active pathgroup, don't ++ * reprobe. Just return the number of valid paths ++ */ ++ if (m->current_pg == m->last_probed_pg) ++ goto skip_probe; ++ } ++ if (!m->current_pg || m->is_suspending || ++ test_bit(MPATHF_QUEUE_IO, &m->flags)) ++ goto skip_probe; ++ set_bit(MPATHF_DELAY_PG_SWITCH, &m->flags); ++ pg = m->last_probed_pg = m->current_pg; ++ spin_unlock_irq(&m->lock); ++ ++ list_for_each_entry(pgpath, &pg->pgpaths, list) { ++ if (pg != READ_ONCE(m->current_pg) || ++ READ_ONCE(m->is_suspending)) ++ goto out; ++ if (!pgpath->is_active) ++ continue; ++ ++ r = probe_path(pgpath); ++ if (r < 0) ++ goto out; ++ } ++ ++ out: ++ spin_lock_irq(&m->lock); ++ clear_bit(MPATHF_DELAY_PG_SWITCH, &m->flags); ++ if (test_and_clear_bit(MPATHF_NEED_PG_SWITCH, &m->flags)) { ++ m->current_pgpath = NULL; ++ m->current_pg = NULL; ++ } ++ skip_probe: ++ if (r == 0 && !atomic_read(&m->nr_valid_paths)) ++ r = -ENOTCONN; ++ spin_unlock_irq(&m->lock); ++ if (pg) ++ wake_up(&m->probe_wait); ++ return r; ++ } ++ +++>>>>>>> 050a3e71ce24 (dm mpath: replace spin_lock_irqsave with spin_lock_irq) + static int multipath_prepare_ioctl(struct dm_target *ti, + struct block_device **bdev, + unsigned int cmd, unsigned long arg, +@@@ -2028,9 -2142,18 +2139,8 @@@ + { + struct multipath *m = ti->private; + struct pgpath *pgpath; +- unsigned long flags; + int r; + + - if (_IOC_TYPE(cmd) == DM_IOCTL) { + - *forward = false; + - switch (cmd) { + - case DM_MPATH_PROBE_PATHS: + - return probe_active_paths(m); + - default: + - return -ENOTTY; + - } + - } + - + pgpath = READ_ONCE(m->current_pgpath); + if (!pgpath || !mpath_double_check_test_bit(MPATHF_QUEUE_IO, m)) + pgpath = choose_pgpath(m, 0); +* Unmerged path drivers/md/dm-mpath.c From 39f27f396bf6fbe11e109656e2514b816506803a Mon Sep 17 00:00:00 2001 From: Jonathan Maple Date: Tue, 8 Jul 2025 11:24:23 -0400 Subject: [PATCH 06/10] vmxnet3: Fix malformed packet sizing in vmxnet3_process_xdp jira LE-3526 cve CVE-2025-37799 Rebuild_History Non-Buildable kernel-6.12.0-55.20.1.el10_0 commit-author Daniel Borkmann commit 4c2227656d9003f4d77afc76f34dd81b95e4c2c4 vmxnet3 driver's XDP handling is buggy for packet sizes using ring0 (that is, packet sizes between 128 - 3k bytes). We noticed MTU-related connectivity issues with Cilium's service load- balancing in case of vmxnet3 as NIC underneath. A simple curl to a HTTP backend service where the XDP LB was doing IPIP encap led to overly large packet sizes but only for *some* of the packets (e.g. HTTP GET request) while others (e.g. the prior TCP 3WHS) looked completely fine on the wire. In fact, the pcap recording on the backend node actually revealed that the node with the XDP LB was leaking uninitialized kernel data onto the wire for the affected packets, for example, while the packets should have been 152 bytes their actual size was 1482 bytes, so the remainder after 152 bytes was padded with whatever other data was in that page at the time (e.g. we saw user/payload data from prior processed packets). We only noticed this through an MTU issue, e.g. when the XDP LB node and the backend node both had the same MTU (e.g. 
1500) then the curl request got dropped on the backend node's NIC given the packet was too large even though the IPIP-encapped packet normally would never even come close to the MTU limit. Lowering the MTU on the XDP LB (e.g. 1480) allowed to let the curl request succeed (which also indicates that the kernel ignored the padding, and thus the issue wasn't very user-visible). Commit e127ce7699c1 ("vmxnet3: Fix missing reserved tailroom") was too eager to also switch xdp_prepare_buff() from rcd->len to rbi->len. It really needs to stick to rcd->len which is the actual packet length from the descriptor. The latter we also feed into vmxnet3_process_xdp_small(), by the way, and it indicates the correct length needed to initialize the xdp->{data,data_end} parts. For e127ce7699c1 ("vmxnet3: Fix missing reserved tailroom") the relevant part was adapting xdp_init_buff() to address the warning given the xdp_data_hard_end() depends on xdp->frame_sz. With that fixed, traffic on the wire looks good again. Fixes: e127ce7699c1 ("vmxnet3: Fix missing reserved tailroom") Signed-off-by: Daniel Borkmann Tested-by: Andrew Sauber Cc: Anton Protopopov Cc: William Tu Cc: Martin Zaharinov Cc: Ronak Doshi Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250423133600.176689-1-daniel@iogearbox.net Signed-off-by: Jakub Kicinski (cherry picked from commit 4c2227656d9003f4d77afc76f34dd81b95e4c2c4) Signed-off-by: Jonathan Maple --- drivers/net/vmxnet3/vmxnet3_xdp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/vmxnet3/vmxnet3_xdp.c b/drivers/net/vmxnet3/vmxnet3_xdp.c index 1341374a4588a..99c3c1e2db4d6 100644 --- a/drivers/net/vmxnet3/vmxnet3_xdp.c +++ b/drivers/net/vmxnet3/vmxnet3_xdp.c @@ -387,7 +387,7 @@ vmxnet3_process_xdp(struct vmxnet3_adapter *adapter, xdp_init_buff(&xdp, PAGE_SIZE, &rq->xdp_rxq); xdp_prepare_buff(&xdp, page_address(page), rq->page_pool->p.offset, - rbi->len, false); + rcd->len, false); xdp_buff_clear_frags_flag(&xdp); xdp_prog = rcu_dereference(rq->adapter->xdp_bpf_prog); From 959fe9bdb180e1be073f047701c234465b7edf50 Mon Sep 17 00:00:00 2001 From: Jonathan Maple Date: Tue, 8 Jul 2025 11:24:24 -0400 Subject: [PATCH 07/10] page_pool: Move pp_magic check into helper functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira LE-3526 Rebuild_History Non-Buildable kernel-6.12.0-55.20.1.el10_0 commit-author Toke Høiland-Jørgensen commit cd3c93167da0e760b5819246eae7a4ea30fd014b Since we are about to stash some more information into the pp_magic field, let's move the magic signature checks into a pair of helper functions so it can be changed in one place. 
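For context, a standalone userspace sketch of the check these helpers
centralize, assuming POISON_POINTER_DELTA == 0 (struct fake_page is an
illustrative stand-in for the kernel's struct page, not part of the patch):

#include <assert.h>
#include <stdbool.h>

#define POISON_POINTER_DELTA 0UL
#define PP_SIGNATURE (0x40UL + POISON_POINTER_DELTA)
#define PP_MAGIC_MASK (~0x3UL)

struct fake_page { unsigned long pp_magic; };

static bool page_pool_page_is_pp(const struct fake_page *page)
{
	/* Bits 0 and 1 may be set for compound-head/pfmemalloc pages,
	 * so mask them off before comparing against the signature. */
	return (page->pp_magic & PP_MAGIC_MASK) == PP_SIGNATURE;
}

int main(void)
{
	struct fake_page pp = { .pp_magic = PP_SIGNATURE | 0x2UL }; /* pfmemalloc bit */
	struct fake_page other = { .pp_magic = 0xdead4ead };        /* arbitrary value */

	assert(page_pool_page_is_pp(&pp));
	assert(!page_pool_page_is_pp(&other));
	return 0;
}

Keeping this comparison behind one helper is what lets the next patch widen
PP_MAGIC_MASK without touching every call site.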
Reviewed-by: Mina Almasry Tested-by: Yonglong Liu Acked-by: Jesper Dangaard Brouer Reviewed-by: Ilias Apalodimas Signed-off-by: Toke Høiland-Jørgensen Link: https://patch.msgid.link/20250409-page-pool-track-dma-v9-1-6a9ef2e0cba8@redhat.com Signed-off-by: Jakub Kicinski (cherry picked from commit cd3c93167da0e760b5819246eae7a4ea30fd014b) Signed-off-by: Jonathan Maple --- .../net/ethernet/mellanox/mlx5/core/en/xdp.c | 4 ++-- include/linux/mm.h | 20 +++++++++++++++++++ mm/page_alloc.c | 8 ++------ net/core/netmem_priv.h | 5 +++++ net/core/skbuff.c | 16 ++------------- net/core/xdp.c | 4 ++-- 6 files changed, 33 insertions(+), 24 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c index 4610621a340e5..fdb5e5c3a5e79 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c @@ -713,8 +713,8 @@ static void mlx5e_free_xdpsq_desc(struct mlx5e_xdpsq *sq, xdpi = mlx5e_xdpi_fifo_pop(xdpi_fifo); page = xdpi.page.page; - /* No need to check ((page->pp_magic & ~0x3UL) == PP_SIGNATURE) - * as we know this is a page_pool page. + /* No need to check page_pool_page_is_pp() as we + * know this is a page_pool page. */ page_pool_recycle_direct(page->pp, page); } while (++n < num); diff --git a/include/linux/mm.h b/include/linux/mm.h index 8235a18fc9af0..69bfae76c4fd5 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4224,4 +4224,24 @@ static inline void pgalloc_tag_copy(struct folio *new, struct folio *old) } #endif /* CONFIG_MEM_ALLOC_PROFILING */ +/* Mask used for checking in page_pool_page_is_pp() below. page->pp_magic is + * OR'ed with PP_SIGNATURE after the allocation in order to preserve bit 0 for + * the head page of compound page and bit 1 for pfmemalloc page. + * page_is_pfmemalloc() is checked in __page_pool_put_page() to avoid recycling + * the pfmemalloc page. 
+ */ +#define PP_MAGIC_MASK ~0x3UL + +#ifdef CONFIG_PAGE_POOL +static inline bool page_pool_page_is_pp(struct page *page) +{ + return (page->pp_magic & PP_MAGIC_MASK) == PP_SIGNATURE; +} +#else +static inline bool page_pool_page_is_pp(struct page *page) +{ + return false; +} +#endif + #endif /* _LINUX_MM_H */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b6958333054d0..fdc2d8345ba3a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -872,9 +872,7 @@ static inline bool page_expected_state(struct page *page, #ifdef CONFIG_MEMCG page->memcg_data | #endif -#ifdef CONFIG_PAGE_POOL - ((page->pp_magic & ~0x3UL) == PP_SIGNATURE) | -#endif + page_pool_page_is_pp(page) | (page->flags & check_flags))) return false; @@ -901,10 +899,8 @@ static const char *page_bad_reason(struct page *page, unsigned long flags) if (unlikely(page->memcg_data)) bad_reason = "page still charged to cgroup"; #endif -#ifdef CONFIG_PAGE_POOL - if (unlikely((page->pp_magic & ~0x3UL) == PP_SIGNATURE)) + if (unlikely(page_pool_page_is_pp(page))) bad_reason = "page_pool leak"; -#endif return bad_reason; } diff --git a/net/core/netmem_priv.h b/net/core/netmem_priv.h index 7eadb8393e002..f33162fd281c2 100644 --- a/net/core/netmem_priv.h +++ b/net/core/netmem_priv.h @@ -18,6 +18,11 @@ static inline void netmem_clear_pp_magic(netmem_ref netmem) __netmem_clear_lsb(netmem)->pp_magic = 0; } +static inline bool netmem_is_pp(netmem_ref netmem) +{ + return (netmem_get_pp_magic(netmem) & PP_MAGIC_MASK) == PP_SIGNATURE; +} + static inline void netmem_set_pp(netmem_ref netmem, struct page_pool *pool) { __netmem_clear_lsb(netmem)->pp = pool; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 74149dc4ee318..0e8ccae8c5655 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -921,11 +921,6 @@ static void skb_clone_fraglist(struct sk_buff *skb) skb_get(list); } -static bool is_pp_netmem(netmem_ref netmem) -{ - return (netmem_get_pp_magic(netmem) & ~0x3UL) == PP_SIGNATURE; -} - int skb_pp_cow_data(struct page_pool *pool, struct sk_buff **pskb, unsigned int headroom) { @@ -1023,14 +1018,7 @@ bool napi_pp_put_page(netmem_ref netmem) { netmem = netmem_compound_head(netmem); - /* page->pp_magic is OR'ed with PP_SIGNATURE after the allocation - * in order to preserve any existing bits, such as bit 0 for the - * head page of compound page and bit 1 for pfmemalloc page, so - * mask those bits for freeing side when doing below checking, - * and page_is_pfmemalloc() is checked in __page_pool_put_page() - * to avoid recycling the pfmemalloc page. 
- */ - if (unlikely(!is_pp_netmem(netmem))) + if (unlikely(!netmem_is_pp(netmem))) return false; page_pool_put_full_netmem(netmem_get_pp(netmem), netmem, false); @@ -1070,7 +1058,7 @@ static int skb_pp_frag_ref(struct sk_buff *skb) for (i = 0; i < shinfo->nr_frags; i++) { head_netmem = netmem_compound_head(shinfo->frags[i].netmem); - if (likely(is_pp_netmem(head_netmem))) + if (likely(netmem_is_pp(head_netmem))) page_pool_ref_netmem(head_netmem); else page_ref_inc(netmem_to_page(head_netmem)); diff --git a/net/core/xdp.c b/net/core/xdp.c index bcc5551c6424b..23e7d736718b0 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -381,8 +381,8 @@ void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct, page = virt_to_head_page(data); if (napi_direct && xdp_return_frame_no_direct()) napi_direct = false; - /* No need to check ((page->pp_magic & ~0x3UL) == PP_SIGNATURE) - * as mem->type knows this a page_pool page + /* No need to check netmem_is_pp() as mem->type knows this a + * page_pool page */ page_pool_put_full_page(page->pp, page, napi_direct); break; From 438548951ea3d9009dffc212b24e3c1c8458b099 Mon Sep 17 00:00:00 2001 From: Jonathan Maple Date: Tue, 8 Jul 2025 11:24:24 -0400 Subject: [PATCH 08/10] page_pool: Track DMA-mapped pages and unmap them when destroying the pool MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira LE-3526 Rebuild_History Non-Buildable kernel-6.12.0-55.20.1.el10_0 commit-author Toke Høiland-Jørgensen commit ee62ce7a1d909ccba0399680a03c2dee83bcae95 When enabling DMA mapping in page_pool, pages are kept DMA mapped until they are released from the pool, to avoid the overhead of re-mapping the pages every time they are used. This causes resource leaks and/or crashes when there are pages still outstanding while the device is torn down, because page_pool will attempt an unmap through a non-existent DMA device on the subsequent page return. To fix this, implement a simple tracking of outstanding DMA-mapped pages in page pool using an xarray. This was first suggested by Mina[0], and turns out to be fairly straight forward: We simply store pointers to pages directly in the xarray with xa_alloc() when they are first DMA mapped, and remove them from the array on unmap. Then, when a page pool is torn down, it can simply walk the xarray and unmap all pages still present there before returning, which also allows us to get rid of the get/put_device() calls in page_pool. Using xa_cmpxchg(), no additional synchronisation is needed, as a page will only ever be unmapped once. To avoid having to walk the entire xarray on unmap to find the page reference, we stash the ID assigned by xa_alloc() into the page structure itself, using the upper bits of the pp_magic field. This requires a couple of defines to avoid conflicting with the POINTER_POISON_DELTA define, but this is all evaluated at compile-time, so does not affect run-time performance. The bitmap calculations in this patch gives the following number of bits for different architectures: - 23 bits on 32-bit architectures - 21 bits on PPC64 (because of the definition of ILLEGAL_POINTER_VALUE) - 32 bits on other 64-bit architectures Stashing a value into the unused bits of pp_magic does have the effect that it can make the value stored there lie outside the unmappable range (as governed by the mmap_min_addr sysctl), for architectures that don't define ILLEGAL_POINTER_VALUE. 
This means that if one of the pointers that is aliased to the pp_magic field (such as page->lru.next) is dereferenced while the page is owned by page_pool, that could lead to a dereference into userspace, which is a security concern. The risk of this is mitigated by the fact that (a) we always clear pp_magic before releasing a page from page_pool, and (b) this would need a use-after-free bug for struct page, which can have many other risks since page->lru.next is used as a generic list pointer in multiple places in the kernel. As such, with this patch we take the position that this risk is negligible in practice. For more discussion, see[1]. Since all the tracking added in this patch is performed on DMA map/unmap, no additional code is needed in the fast path, meaning the performance overhead of this tracking is negligible there. A micro-benchmark shows that the total overhead of the tracking itself is about 400 ns (39 cycles(tsc) 395.218 ns; sum for both map and unmap[2]). Since this cost is only paid on DMA map and unmap, it seems like an acceptable cost to fix the late unmap issue. Further optimisation can narrow the cases where this cost is paid (for instance by eliding the tracking when DMA map/unmap is a no-op). The extra memory needed to track the pages is neatly encapsulated inside xarray, which uses the 'struct xa_node' structure to track items. This structure is 576 bytes long, with slots for 64 items, meaning that a full node occurs only 9 bytes of overhead per slot it tracks (in practice, it probably won't be this efficient, but in any case it should be an acceptable overhead). [0] https://lore.kernel.org/all/CAHS8izPg7B5DwKfSuzz-iOop_YRbk3Sd6Y4rX7KBG9DcVJcyWg@mail.gmail.com/ [1] https://lore.kernel.org/r/20250320023202.GA25514@openwall.com [2] https://lore.kernel.org/r/ae07144c-9295-4c9d-a400-153bb689fe9e@huawei.com Reported-by: Yonglong Liu Closes: https://lore.kernel.org/r/8743264a-9700-4227-a556-5f931c720211@huawei.com Fixes: ff7d6b27f894 ("page_pool: refurbish version of page_pool code") Suggested-by: Mina Almasry Reviewed-by: Mina Almasry Reviewed-by: Jesper Dangaard Brouer Tested-by: Jesper Dangaard Brouer Tested-by: Qiuling Ren Tested-by: Yuying Ma Tested-by: Yonglong Liu Acked-by: Jesper Dangaard Brouer Signed-off-by: Toke Høiland-Jørgensen Link: https://patch.msgid.link/20250409-page-pool-track-dma-v9-2-6a9ef2e0cba8@redhat.com Signed-off-by: Jakub Kicinski (cherry picked from commit ee62ce7a1d909ccba0399680a03c2dee83bcae95) Signed-off-by: Jonathan Maple --- include/linux/mm.h | 46 ++++++++++++++++++-- include/linux/poison.h | 4 ++ include/net/page_pool/types.h | 6 +++ net/core/netmem_priv.h | 28 +++++++++++- net/core/page_pool.c | 81 +++++++++++++++++++++++++++++------ 5 files changed, 147 insertions(+), 18 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 69bfae76c4fd5..5478c1ca5050f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4224,13 +4224,51 @@ static inline void pgalloc_tag_copy(struct folio *new, struct folio *old) } #endif /* CONFIG_MEM_ALLOC_PROFILING */ +/* + * DMA mapping IDs for page_pool + * + * When DMA-mapping a page, page_pool allocates an ID (from an xarray) and + * stashes it in the upper bits of page->pp_magic. We always want to be able to + * unambiguously identify page pool pages (using page_pool_page_is_pp()). 
Non-PP + * pages can have arbitrary kernel pointers stored in the same field as pp_magic + * (since it overlaps with page->lru.next), so we must ensure that we cannot + * mistake a valid kernel pointer with any of the values we write into this + * field. + * + * On architectures that set POISON_POINTER_DELTA, this is already ensured, + * since this value becomes part of PP_SIGNATURE; meaning we can just use the + * space between the PP_SIGNATURE value (without POISON_POINTER_DELTA), and the + * lowest bits of POISON_POINTER_DELTA. On arches where POISON_POINTER_DELTA is + * 0, we make sure that we leave the two topmost bits empty, as that guarantees + * we won't mistake a valid kernel pointer for a value we set, regardless of the + * VMSPLIT setting. + * + * Altogether, this means that the number of bits available is constrained by + * the size of an unsigned long (at the upper end, subtracting two bits per the + * above), and the definition of PP_SIGNATURE (with or without + * POISON_POINTER_DELTA). + */ +#define PP_DMA_INDEX_SHIFT (1 + __fls(PP_SIGNATURE - POISON_POINTER_DELTA)) +#if POISON_POINTER_DELTA > 0 +/* PP_SIGNATURE includes POISON_POINTER_DELTA, so limit the size of the DMA + * index to not overlap with that if set + */ +#define PP_DMA_INDEX_BITS MIN(32, __ffs(POISON_POINTER_DELTA) - PP_DMA_INDEX_SHIFT) +#else +/* Always leave out the topmost two; see above. */ +#define PP_DMA_INDEX_BITS MIN(32, BITS_PER_LONG - PP_DMA_INDEX_SHIFT - 2) +#endif + +#define PP_DMA_INDEX_MASK GENMASK(PP_DMA_INDEX_BITS + PP_DMA_INDEX_SHIFT - 1, \ + PP_DMA_INDEX_SHIFT) + /* Mask used for checking in page_pool_page_is_pp() below. page->pp_magic is * OR'ed with PP_SIGNATURE after the allocation in order to preserve bit 0 for - * the head page of compound page and bit 1 for pfmemalloc page. - * page_is_pfmemalloc() is checked in __page_pool_put_page() to avoid recycling - * the pfmemalloc page. + * the head page of compound page and bit 1 for pfmemalloc page, as well as the + * bits used for the DMA index. page_is_pfmemalloc() is checked in + * __page_pool_put_page() to avoid recycling the pfmemalloc page. 
*/ -#define PP_MAGIC_MASK ~0x3UL +#define PP_MAGIC_MASK ~(PP_DMA_INDEX_MASK | 0x3UL) #ifdef CONFIG_PAGE_POOL static inline bool page_pool_page_is_pp(struct page *page) diff --git a/include/linux/poison.h b/include/linux/poison.h index 331a9a996fa87..8ca2235f78d5d 100644 --- a/include/linux/poison.h +++ b/include/linux/poison.h @@ -70,6 +70,10 @@ #define KEY_DESTROY 0xbd /********** net/core/page_pool.c **********/ +/* + * page_pool uses additional free bits within this value to store data, see the + * definition of PP_DMA_INDEX_MASK in mm.h + */ #define PP_SIGNATURE (0x40 + POISON_POINTER_DELTA) /********** net/core/skbuff.c **********/ diff --git a/include/net/page_pool/types.h b/include/net/page_pool/types.h index c022c410abe39..f53e2c90b6866 100644 --- a/include/net/page_pool/types.h +++ b/include/net/page_pool/types.h @@ -6,6 +6,7 @@ #include #include #include +#include #include #define PP_FLAG_DMA_MAP BIT(0) /* Should page_pool do the DMA @@ -33,6 +34,9 @@ #define PP_FLAG_ALL (PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV | \ PP_FLAG_SYSTEM_POOL | PP_FLAG_ALLOW_UNREADABLE_NETMEM) +/* Index limit to stay within PP_DMA_INDEX_BITS for DMA indices */ +#define PP_DMA_INDEX_LIMIT XA_LIMIT(1, BIT(PP_DMA_INDEX_BITS) - 1) + /* * Fast allocation side cache array/stack * @@ -216,6 +220,8 @@ struct page_pool { void *mp_priv; + struct xarray dma_mapped; + #ifdef CONFIG_PAGE_POOL_STATS /* recycle stats are per-cpu to avoid locking */ struct page_pool_recycle_stats __percpu *recycle_stats; diff --git a/net/core/netmem_priv.h b/net/core/netmem_priv.h index f33162fd281c2..cd95394399b40 100644 --- a/net/core/netmem_priv.h +++ b/net/core/netmem_priv.h @@ -5,7 +5,7 @@ static inline unsigned long netmem_get_pp_magic(netmem_ref netmem) { - return __netmem_clear_lsb(netmem)->pp_magic; + return __netmem_clear_lsb(netmem)->pp_magic & ~PP_DMA_INDEX_MASK; } static inline void netmem_or_pp_magic(netmem_ref netmem, unsigned long pp_magic) @@ -15,6 +15,8 @@ static inline void netmem_or_pp_magic(netmem_ref netmem, unsigned long pp_magic) static inline void netmem_clear_pp_magic(netmem_ref netmem) { + WARN_ON_ONCE(__netmem_clear_lsb(netmem)->pp_magic & PP_DMA_INDEX_MASK); + __netmem_clear_lsb(netmem)->pp_magic = 0; } @@ -33,4 +35,28 @@ static inline void netmem_set_dma_addr(netmem_ref netmem, { __netmem_clear_lsb(netmem)->dma_addr = dma_addr; } + +static inline unsigned long netmem_get_dma_index(netmem_ref netmem) +{ + unsigned long magic; + + if (WARN_ON_ONCE(netmem_is_net_iov(netmem))) + return 0; + + magic = __netmem_clear_lsb(netmem)->pp_magic; + + return (magic & PP_DMA_INDEX_MASK) >> PP_DMA_INDEX_SHIFT; +} + +static inline void netmem_set_dma_index(netmem_ref netmem, + unsigned long id) +{ + unsigned long magic; + + if (WARN_ON_ONCE(netmem_is_net_iov(netmem))) + return; + + magic = netmem_get_pp_magic(netmem) | (id << PP_DMA_INDEX_SHIFT); + __netmem_clear_lsb(netmem)->pp_magic = magic; +} #endif diff --git a/net/core/page_pool.c b/net/core/page_pool.c index a813d30d21353..5de406a7affd0 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -272,8 +272,7 @@ static int page_pool_init(struct page_pool *pool, /* Driver calling page_pool_create() also call page_pool_destroy() */ refcount_set(&pool->user_cnt, 1); - if (pool->dma_map) - get_device(pool->p.dev); + xa_init_flags(&pool->dma_mapped, XA_FLAGS_ALLOC1); if (pool->slow.flags & PP_FLAG_ALLOW_UNREADABLE_NETMEM) { /* We rely on rtnl_lock()ing to make sure netdev_rx_queue @@ -311,9 +310,7 @@ static int page_pool_init(struct page_pool *pool, static void 
page_pool_uninit(struct page_pool *pool) { ptr_ring_cleanup(&pool->ring, NULL); - - if (pool->dma_map) - put_device(pool->p.dev); + xa_destroy(&pool->dma_mapped); #ifdef CONFIG_PAGE_POOL_STATS if (!pool->system) @@ -454,13 +451,21 @@ page_pool_dma_sync_for_device(const struct page_pool *pool, netmem_ref netmem, u32 dma_sync_size) { - if (pool->dma_sync && dma_dev_need_sync(pool->p.dev)) - __page_pool_dma_sync_for_device(pool, netmem, dma_sync_size); + if (pool->dma_sync && dma_dev_need_sync(pool->p.dev)) { + rcu_read_lock(); + /* re-check under rcu_read_lock() to sync with page_pool_scrub() */ + if (pool->dma_sync) + __page_pool_dma_sync_for_device(pool, netmem, + dma_sync_size); + rcu_read_unlock(); + } } -static bool page_pool_dma_map(struct page_pool *pool, netmem_ref netmem) +static bool page_pool_dma_map(struct page_pool *pool, netmem_ref netmem, gfp_t gfp) { dma_addr_t dma; + int err; + u32 id; /* Setup DMA mapping: use 'struct page' area for storing DMA-addr * since dma_addr_t can be either 32 or 64 bits and does not always fit @@ -474,15 +479,30 @@ static bool page_pool_dma_map(struct page_pool *pool, netmem_ref netmem) if (dma_mapping_error(pool->p.dev, dma)) return false; - if (page_pool_set_dma_addr_netmem(netmem, dma)) + if (page_pool_set_dma_addr_netmem(netmem, dma)) { + WARN_ONCE(1, "unexpected DMA address, please report to netdev@"); goto unmap_failed; + } + + if (in_softirq()) + err = xa_alloc(&pool->dma_mapped, &id, netmem_to_page(netmem), + PP_DMA_INDEX_LIMIT, gfp); + else + err = xa_alloc_bh(&pool->dma_mapped, &id, netmem_to_page(netmem), + PP_DMA_INDEX_LIMIT, gfp); + if (err) { + WARN_ONCE(err != -ENOMEM, "couldn't track DMA mapping, please report to netdev@"); + goto unset_failed; + } + netmem_set_dma_index(netmem, id); page_pool_dma_sync_for_device(pool, netmem, pool->p.max_len); return true; +unset_failed: + page_pool_set_dma_addr_netmem(netmem, 0); unmap_failed: - WARN_ONCE(1, "unexpected DMA address, please report to netdev@"); dma_unmap_page_attrs(pool->p.dev, dma, PAGE_SIZE << pool->p.order, pool->p.dma_dir, DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING); @@ -499,7 +519,7 @@ static struct page *__page_pool_alloc_page_order(struct page_pool *pool, if (unlikely(!page)) return NULL; - if (pool->dma_map && unlikely(!page_pool_dma_map(pool, page_to_netmem(page)))) { + if (pool->dma_map && unlikely(!page_pool_dma_map(pool, page_to_netmem(page), gfp))) { put_page(page); return NULL; } @@ -546,7 +566,7 @@ static noinline netmem_ref __page_pool_alloc_pages_slow(struct page_pool *pool, */ for (i = 0; i < nr_pages; i++) { netmem = pool->alloc.cache[i]; - if (dma_map && unlikely(!page_pool_dma_map(pool, netmem))) { + if (dma_map && unlikely(!page_pool_dma_map(pool, netmem, gfp))) { put_page(netmem_to_page(netmem)); continue; } @@ -648,6 +668,8 @@ void page_pool_clear_pp_info(netmem_ref netmem) static __always_inline void __page_pool_release_page_dma(struct page_pool *pool, netmem_ref netmem) { + struct page *old, *page = netmem_to_page(netmem); + unsigned long id; dma_addr_t dma; if (!pool->dma_map) @@ -656,6 +678,17 @@ static __always_inline void __page_pool_release_page_dma(struct page_pool *pool, */ return; + id = netmem_get_dma_index(netmem); + if (!id) + return; + + if (in_softirq()) + old = xa_cmpxchg(&pool->dma_mapped, id, page, NULL, 0); + else + old = xa_cmpxchg_bh(&pool->dma_mapped, id, page, NULL, 0); + if (old != page) + return; + dma = page_pool_get_dma_addr_netmem(netmem); /* When page is unmapped, it cannot be returned to our pool */ @@ -663,6 +696,7 @@ 
static __always_inline void __page_pool_release_page_dma(struct page_pool *pool, PAGE_SIZE << pool->p.order, pool->p.dma_dir, DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING); page_pool_set_dma_addr_netmem(netmem, 0); + netmem_set_dma_index(netmem, 0); } /* Disconnects a page (from a page_pool). API users can have a need @@ -1037,8 +1071,29 @@ static void page_pool_empty_alloc_cache_once(struct page_pool *pool) static void page_pool_scrub(struct page_pool *pool) { + unsigned long id; + void *ptr; + page_pool_empty_alloc_cache_once(pool); - pool->destroy_cnt++; + if (!pool->destroy_cnt++ && pool->dma_map) { + if (pool->dma_sync) { + /* Disable page_pool_dma_sync_for_device() */ + pool->dma_sync = false; + + /* Make sure all concurrent returns that may see the old + * value of dma_sync (and thus perform a sync) have + * finished before doing the unmapping below. Skip the + * wait if the device doesn't actually need syncing, or + * if there are no outstanding mapped pages. + */ + if (dma_dev_need_sync(pool->p.dev) && + !xa_empty(&pool->dma_mapped)) + synchronize_net(); + } + + xa_for_each(&pool->dma_mapped, id, ptr) + __page_pool_release_page_dma(pool, page_to_netmem(ptr)); + } /* No more consumers should exist, but producers could still * be in-flight. From ccc3544ac2a1b417ff94e53dd0ade3f3d5a2f768 Mon Sep 17 00:00:00 2001 From: Jonathan Maple Date: Tue, 8 Jul 2025 11:24:24 -0400 Subject: [PATCH 09/10] x86/microcode/AMD: Fix out-of-bounds on systems with CPU-less NUMA nodes jira LE-3526 cve CVE-2025-21991 Rebuild_History Non-Buildable kernel-6.12.0-55.20.1.el10_0 commit-author Florent Revest commit e3e89178a9f4a80092578af3ff3c8478f9187d59 Currently, load_microcode_amd() iterates over all NUMA nodes, retrieves their CPU masks and unconditionally accesses per-CPU data for the first CPU of each mask. According to Documentation/admin-guide/mm/numaperf.rst: "Some memory may share the same node as a CPU, and others are provided as memory only nodes." Therefore, some node CPU masks may be empty and wouldn't have a "first CPU". On a machine with far memory (and therefore CPU-less NUMA nodes): - cpumask_of_node(nid) is 0 - cpumask_first(0) is CONFIG_NR_CPUS - cpu_data(CONFIG_NR_CPUS) accesses the cpu_info per-CPU array at an index that is 1 out of bounds This does not have any security implications since flashing microcode is a privileged operation, but I believe this has reliability implications by potentially corrupting memory while flashing a microcode update. When booting with CONFIG_UBSAN_BOUNDS=y on an AMD machine that flashes a microcode update, I get the following splat: UBSAN: array-index-out-of-bounds in arch/x86/kernel/cpu/microcode/amd.c:X:Y index 512 is out of range for type 'unsigned long[512]' [...] Call Trace: dump_stack __ubsan_handle_out_of_bounds load_microcode_amd request_microcode_amd reload_store kernfs_fop_write_iter vfs_write ksys_write do_syscall_64 entry_SYSCALL_64_after_hwframe Change the loop to go over only NUMA nodes which have CPUs before determining whether the first CPU on the respective node needs a microcode update. [ bp: Massage commit message, fix typo.
] Fixes: 7ff6edf4fef3 ("x86/microcode/AMD: Fix mixed steppings support") Signed-off-by: Florent Revest Signed-off-by: Borislav Petkov (AMD) Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20250310144243.861978-1-revest@chromium.org (cherry picked from commit e3e89178a9f4a80092578af3ff3c8478f9187d59) Signed-off-by: Jonathan Maple --- arch/x86/kernel/cpu/microcode/amd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index 31a73715d7553..a2afbaae99c70 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -979,7 +979,7 @@ static enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t siz if (ret != UCODE_OK) return ret; - for_each_node(nid) { + for_each_node_with_cpus(nid) { cpu = cpumask_first(cpumask_of_node(nid)); c = &cpu_data(cpu); From 3381775694c1e4264886dd2aae5e5ce8dc05747e Mon Sep 17 00:00:00 2001 From: Jonathan Maple Date: Tue, 8 Jul 2025 11:24:46 -0400 Subject: [PATCH 10/10] Rebuild rocky10_0 with kernel-6.12.0-55.20.1.el10_0 Rebuild_History BUILDABLE Rebuilding Kernel from rpm changelog with Fuzz Limit: 87.50% Number of commits in upstream range v6.12~1..kernel-mainline: 52012 Number of commits in rpm: 14 Number of commits matched with upstream: 10 (71.43%) Number of commits in upstream but not in rpm: 52002 Number of commits NOT found in upstream: 4 (28.57%) Rebuilding Kernel on Branch rocky10_0_rebuild_kernel-6.12.0-55.20.1.el10_0 for kernel-6.12.0-55.20.1.el10_0 Clean Cherry Picks: 6 (60.00%) Empty Cherry Picks: 3 (30.00%) _______________________________ Full Details Located here: ciq/ciq_backports/kernel-6.12.0-55.20.1.el10_0/rebuild.details.txt Includes: * git commit header above * Empty Commits with upstream SHA * RPM ChangeLog Entries that could not be matched Individual Empty Commit failures are contained in the same directory. The git message for empty commits will have the path for the failed commit.
File names are the first 8 characters of the upstream SHA --- ....1.el10_0 => COPYING-6.12.0-55.20.1.el10_0 | 0 Makefile.rhelver | 2 +- .../rebuild.details.txt | 23 ++ .../kernel-6.12.0-aarch64-64k-debug.config | 4 +- configs/kernel-6.12.0-aarch64-64k.config | 4 +- ...nel-6.12.0-aarch64-automotive-debug.config | 4 +- .../kernel-6.12.0-aarch64-automotive.config | 4 +- configs/kernel-6.12.0-aarch64-debug.config | 4 +- .../kernel-6.12.0-aarch64-rt-64k-debug.config | 4 +- configs/kernel-6.12.0-aarch64-rt-64k.config | 4 +- configs/kernel-6.12.0-aarch64-rt-debug.config | 4 +- configs/kernel-6.12.0-aarch64-rt.config | 4 +- configs/kernel-6.12.0-aarch64.config | 4 +- configs/kernel-6.12.0-ppc64le-debug.config | 4 +- configs/kernel-6.12.0-ppc64le.config | 4 +- configs/kernel-6.12.0-riscv64-debug.config | 4 +- configs/kernel-6.12.0-riscv64.config | 4 +- configs/kernel-6.12.0-s390x-debug.config | 4 +- configs/kernel-6.12.0-s390x-zfcpdump.config | 4 +- configs/kernel-6.12.0-s390x.config | 4 +- ...rnel-6.12.0-x86_64-automotive-debug.config | 4 +- .../kernel-6.12.0-x86_64-automotive.config | 4 +- configs/kernel-6.12.0-x86_64-debug.config | 4 +- configs/kernel-6.12.0-x86_64-rt-debug.config | 4 +- configs/kernel-6.12.0-x86_64-rt.config | 4 +- configs/kernel-6.12.0-x86_64.config | 4 +- drivers/md/dm-ioctl.c | 1 + drivers/md/dm-mpath.c | 239 +++++++++++++----- drivers/scsi/storvsc_drv.c | 1 + include/linux/mm.h | 116 ++++----- include/net/page_pool/types.h | 4 +- include/uapi/linux/dm-ioctl.h | 9 +- redhat/kernel.changelog-10.0 | 14 + 33 files changed, 334 insertions(+), 167 deletions(-) rename COPYING-6.12.0-55.19.1.el10_0 => COPYING-6.12.0-55.20.1.el10_0 (100%) create mode 100644 ciq/ciq_backports/kernel-6.12.0-55.20.1.el10_0/rebuild.details.txt diff --git a/COPYING-6.12.0-55.19.1.el10_0 b/COPYING-6.12.0-55.20.1.el10_0 similarity index 100% rename from COPYING-6.12.0-55.19.1.el10_0 rename to COPYING-6.12.0-55.20.1.el10_0 diff --git a/Makefile.rhelver b/Makefile.rhelver index 723a137bd81e8..825a53542b189 100644 --- a/Makefile.rhelver +++ b/Makefile.rhelver @@ -12,7 +12,7 @@ RHEL_MINOR = 0 # # Use this spot to avoid future merge conflicts. # Do not trim this comment. 
-RHEL_RELEASE = 55.19.1 +RHEL_RELEASE = 55.20.1 # # RHEL_REBASE_NUM diff --git a/ciq/ciq_backports/kernel-6.12.0-55.20.1.el10_0/rebuild.details.txt b/ciq/ciq_backports/kernel-6.12.0-55.20.1.el10_0/rebuild.details.txt new file mode 100644 index 0000000000000..a20f918b2ed86 --- /dev/null +++ b/ciq/ciq_backports/kernel-6.12.0-55.20.1.el10_0/rebuild.details.txt @@ -0,0 +1,23 @@ +Rebuild_History BUILDABLE +Rebuilding Kernel from rpm changelog with Fuzz Limit: 87.50% +Number of commits in upstream range v6.12~1..kernel-mainline: 52012 +Number of commits in rpm: 14 +Number of commits matched with upstream: 10 (71.43%) +Number of commits in upstream but not in rpm: 52002 +Number of commits NOT found in upstream: 4 (28.57%) + +Rebuilding Kernel on Branch rocky10_0_rebuild_kernel-6.12.0-55.20.1.el10_0 for kernel-6.12.0-55.20.1.el10_0 +Clean Cherry Picks: 6 (60.00%) +Empty Cherry Picks: 3 (30.00%) +_______________________________ + +__EMPTY COMMITS__________________________ +7734fb4ad98c3fdaf0fde82978ef8638195a5285 dm mpath: Interface for explicit probing of active paths +5c977f1023156938915c57d362fddde8fad2b052 dm-mpath: Don't grab work_mutex while probing paths +050a3e71ce24c6f18d70679d68056f76375ff51c dm mpath: replace spin_lock_irqsave with spin_lock_irq + +__CHANGES NOT IN UPSTREAM________________ +Porting to Rocky Linux 10, debranding and Rocky Linux branding' +Add partial riscv64 support for build root' +Provide basic VisionFive 2 support' +scsi: storvsc: Explicitly set max_segment_size to UINT_MAX diff --git a/configs/kernel-6.12.0-aarch64-64k-debug.config b/configs/kernel-6.12.0-aarch64-64k-debug.config index 67d28f4ed4c82..a39468e4904fd 100644 --- a/configs/kernel-6.12.0-aarch64-64k-debug.config +++ b/configs/kernel-6.12.0-aarch64-64k-debug.config @@ -12,8 +12,8 @@ CONFIG_AS_VERSION=25000 CONFIG_LD_IS_BFD=y CONFIG_LD_VERSION=25000 CONFIG_LLD_VERSION=0 -CONFIG_RUSTC_VERSION=0 -CONFIG_RUSTC_LLVM_VERSION=0 +CONFIG_RUSTC_VERSION=107600 +CONFIG_RUSTC_LLVM_VERSION=170006 CONFIG_CC_CAN_LINK=y CONFIG_CC_CAN_LINK_STATIC=y CONFIG_CC_HAS_ASM_GOTO_OUTPUT=y diff --git a/configs/kernel-6.12.0-aarch64-64k.config b/configs/kernel-6.12.0-aarch64-64k.config index cb7f1eaf2afd8..93289fc20f37d 100644 --- a/configs/kernel-6.12.0-aarch64-64k.config +++ b/configs/kernel-6.12.0-aarch64-64k.config @@ -12,8 +12,8 @@ CONFIG_AS_VERSION=25000 CONFIG_LD_IS_BFD=y CONFIG_LD_VERSION=25000 CONFIG_LLD_VERSION=0 -CONFIG_RUSTC_VERSION=0 -CONFIG_RUSTC_LLVM_VERSION=0 +CONFIG_RUSTC_VERSION=107600 +CONFIG_RUSTC_LLVM_VERSION=170006 CONFIG_CC_CAN_LINK=y CONFIG_CC_CAN_LINK_STATIC=y CONFIG_CC_HAS_ASM_GOTO_OUTPUT=y diff --git a/configs/kernel-6.12.0-aarch64-automotive-debug.config b/configs/kernel-6.12.0-aarch64-automotive-debug.config index f2a4b38de9347..75ef55c07e811 100644 --- a/configs/kernel-6.12.0-aarch64-automotive-debug.config +++ b/configs/kernel-6.12.0-aarch64-automotive-debug.config @@ -12,8 +12,8 @@ CONFIG_AS_VERSION=25000 CONFIG_LD_IS_BFD=y CONFIG_LD_VERSION=25000 CONFIG_LLD_VERSION=0 -CONFIG_RUSTC_VERSION=0 -CONFIG_RUSTC_LLVM_VERSION=0 +CONFIG_RUSTC_VERSION=107600 +CONFIG_RUSTC_LLVM_VERSION=170006 CONFIG_CC_CAN_LINK=y CONFIG_CC_CAN_LINK_STATIC=y CONFIG_CC_HAS_ASM_GOTO_OUTPUT=y diff --git a/configs/kernel-6.12.0-aarch64-automotive.config b/configs/kernel-6.12.0-aarch64-automotive.config index ac69e51588dfa..38b1136c11672 100644 --- a/configs/kernel-6.12.0-aarch64-automotive.config +++ b/configs/kernel-6.12.0-aarch64-automotive.config @@ -12,8 +12,8 @@ CONFIG_AS_VERSION=25000 CONFIG_LD_IS_BFD=y CONFIG_LD_VERSION=25000 
CONFIG_LLD_VERSION=0 -CONFIG_RUSTC_VERSION=0 -CONFIG_RUSTC_LLVM_VERSION=0 +CONFIG_RUSTC_VERSION=107600 +CONFIG_RUSTC_LLVM_VERSION=170006 CONFIG_CC_CAN_LINK=y CONFIG_CC_CAN_LINK_STATIC=y CONFIG_CC_HAS_ASM_GOTO_OUTPUT=y diff --git a/configs/kernel-6.12.0-aarch64-debug.config b/configs/kernel-6.12.0-aarch64-debug.config index 2feabadbc4576..fbdfd945bf4d6 100644 --- a/configs/kernel-6.12.0-aarch64-debug.config +++ b/configs/kernel-6.12.0-aarch64-debug.config @@ -12,8 +12,8 @@ CONFIG_AS_VERSION=25000 CONFIG_LD_IS_BFD=y CONFIG_LD_VERSION=25000 CONFIG_LLD_VERSION=0 -CONFIG_RUSTC_VERSION=0 -CONFIG_RUSTC_LLVM_VERSION=0 +CONFIG_RUSTC_VERSION=107600 +CONFIG_RUSTC_LLVM_VERSION=170006 CONFIG_CC_CAN_LINK=y CONFIG_CC_CAN_LINK_STATIC=y CONFIG_CC_HAS_ASM_GOTO_OUTPUT=y diff --git a/configs/kernel-6.12.0-aarch64-rt-64k-debug.config b/configs/kernel-6.12.0-aarch64-rt-64k-debug.config index 96dc55cd746c8..e7301df51f965 100644 --- a/configs/kernel-6.12.0-aarch64-rt-64k-debug.config +++ b/configs/kernel-6.12.0-aarch64-rt-64k-debug.config @@ -12,8 +12,8 @@ CONFIG_AS_VERSION=25000 CONFIG_LD_IS_BFD=y CONFIG_LD_VERSION=25000 CONFIG_LLD_VERSION=0 -CONFIG_RUSTC_VERSION=0 -CONFIG_RUSTC_LLVM_VERSION=0 +CONFIG_RUSTC_VERSION=107600 +CONFIG_RUSTC_LLVM_VERSION=170006 CONFIG_CC_CAN_LINK=y CONFIG_CC_CAN_LINK_STATIC=y CONFIG_CC_HAS_ASM_GOTO_OUTPUT=y diff --git a/configs/kernel-6.12.0-aarch64-rt-64k.config b/configs/kernel-6.12.0-aarch64-rt-64k.config index 0990e023fe90e..68e86211d09ce 100644 --- a/configs/kernel-6.12.0-aarch64-rt-64k.config +++ b/configs/kernel-6.12.0-aarch64-rt-64k.config @@ -12,8 +12,8 @@ CONFIG_AS_VERSION=25000 CONFIG_LD_IS_BFD=y CONFIG_LD_VERSION=25000 CONFIG_LLD_VERSION=0 -CONFIG_RUSTC_VERSION=0 -CONFIG_RUSTC_LLVM_VERSION=0 +CONFIG_RUSTC_VERSION=107600 +CONFIG_RUSTC_LLVM_VERSION=170006 CONFIG_CC_CAN_LINK=y CONFIG_CC_CAN_LINK_STATIC=y CONFIG_CC_HAS_ASM_GOTO_OUTPUT=y diff --git a/configs/kernel-6.12.0-aarch64-rt-debug.config b/configs/kernel-6.12.0-aarch64-rt-debug.config index 57a9e4a8502b1..ed28883b98d5e 100644 --- a/configs/kernel-6.12.0-aarch64-rt-debug.config +++ b/configs/kernel-6.12.0-aarch64-rt-debug.config @@ -12,8 +12,8 @@ CONFIG_AS_VERSION=25000 CONFIG_LD_IS_BFD=y CONFIG_LD_VERSION=25000 CONFIG_LLD_VERSION=0 -CONFIG_RUSTC_VERSION=0 -CONFIG_RUSTC_LLVM_VERSION=0 +CONFIG_RUSTC_VERSION=107600 +CONFIG_RUSTC_LLVM_VERSION=170006 CONFIG_CC_CAN_LINK=y CONFIG_CC_CAN_LINK_STATIC=y CONFIG_CC_HAS_ASM_GOTO_OUTPUT=y diff --git a/configs/kernel-6.12.0-aarch64-rt.config b/configs/kernel-6.12.0-aarch64-rt.config index 2524f08ee0cdd..41c58e5e56697 100644 --- a/configs/kernel-6.12.0-aarch64-rt.config +++ b/configs/kernel-6.12.0-aarch64-rt.config @@ -12,8 +12,8 @@ CONFIG_AS_VERSION=25000 CONFIG_LD_IS_BFD=y CONFIG_LD_VERSION=25000 CONFIG_LLD_VERSION=0 -CONFIG_RUSTC_VERSION=0 -CONFIG_RUSTC_LLVM_VERSION=0 +CONFIG_RUSTC_VERSION=107600 +CONFIG_RUSTC_LLVM_VERSION=170006 CONFIG_CC_CAN_LINK=y CONFIG_CC_CAN_LINK_STATIC=y CONFIG_CC_HAS_ASM_GOTO_OUTPUT=y diff --git a/configs/kernel-6.12.0-aarch64.config b/configs/kernel-6.12.0-aarch64.config index f7626dcc7cd16..f29cb94f0bfcb 100644 --- a/configs/kernel-6.12.0-aarch64.config +++ b/configs/kernel-6.12.0-aarch64.config @@ -12,8 +12,8 @@ CONFIG_AS_VERSION=25000 CONFIG_LD_IS_BFD=y CONFIG_LD_VERSION=25000 CONFIG_LLD_VERSION=0 -CONFIG_RUSTC_VERSION=0 -CONFIG_RUSTC_LLVM_VERSION=0 +CONFIG_RUSTC_VERSION=107600 +CONFIG_RUSTC_LLVM_VERSION=170006 CONFIG_CC_CAN_LINK=y CONFIG_CC_CAN_LINK_STATIC=y CONFIG_CC_HAS_ASM_GOTO_OUTPUT=y diff --git a/configs/kernel-6.12.0-ppc64le-debug.config 
b/configs/kernel-6.12.0-ppc64le-debug.config index 9de0e98ea64bc..54772c9dbafc0 100644 --- a/configs/kernel-6.12.0-ppc64le-debug.config +++ b/configs/kernel-6.12.0-ppc64le-debug.config @@ -12,8 +12,8 @@ CONFIG_AS_VERSION=25000 CONFIG_LD_IS_BFD=y CONFIG_LD_VERSION=25000 CONFIG_LLD_VERSION=0 -CONFIG_RUSTC_VERSION=0 -CONFIG_RUSTC_LLVM_VERSION=0 +CONFIG_RUSTC_VERSION=107600 +CONFIG_RUSTC_LLVM_VERSION=170006 CONFIG_CC_CAN_LINK=y CONFIG_CC_CAN_LINK_STATIC=y CONFIG_CC_HAS_ASM_GOTO_OUTPUT=y diff --git a/configs/kernel-6.12.0-ppc64le.config b/configs/kernel-6.12.0-ppc64le.config index 845b8d369d638..d79df6728358d 100644 --- a/configs/kernel-6.12.0-ppc64le.config +++ b/configs/kernel-6.12.0-ppc64le.config @@ -12,8 +12,8 @@ CONFIG_AS_VERSION=25000 CONFIG_LD_IS_BFD=y CONFIG_LD_VERSION=25000 CONFIG_LLD_VERSION=0 -CONFIG_RUSTC_VERSION=0 -CONFIG_RUSTC_LLVM_VERSION=0 +CONFIG_RUSTC_VERSION=107600 +CONFIG_RUSTC_LLVM_VERSION=170006 CONFIG_CC_CAN_LINK=y CONFIG_CC_CAN_LINK_STATIC=y CONFIG_CC_HAS_ASM_GOTO_OUTPUT=y diff --git a/configs/kernel-6.12.0-riscv64-debug.config b/configs/kernel-6.12.0-riscv64-debug.config index a75241f27ca4c..bccb65d88d394 100644 --- a/configs/kernel-6.12.0-riscv64-debug.config +++ b/configs/kernel-6.12.0-riscv64-debug.config @@ -12,8 +12,8 @@ CONFIG_AS_VERSION=25000 CONFIG_LD_IS_BFD=y CONFIG_LD_VERSION=25000 CONFIG_LLD_VERSION=0 -CONFIG_RUSTC_VERSION=0 -CONFIG_RUSTC_LLVM_VERSION=0 +CONFIG_RUSTC_VERSION=107600 +CONFIG_RUSTC_LLVM_VERSION=170006 CONFIG_CC_CAN_LINK=y CONFIG_CC_CAN_LINK_STATIC=y CONFIG_CC_HAS_ASM_GOTO_OUTPUT=y diff --git a/configs/kernel-6.12.0-riscv64.config b/configs/kernel-6.12.0-riscv64.config index 1491475035b1f..945ebe7001eb6 100644 --- a/configs/kernel-6.12.0-riscv64.config +++ b/configs/kernel-6.12.0-riscv64.config @@ -12,8 +12,8 @@ CONFIG_AS_VERSION=25000 CONFIG_LD_IS_BFD=y CONFIG_LD_VERSION=25000 CONFIG_LLD_VERSION=0 -CONFIG_RUSTC_VERSION=0 -CONFIG_RUSTC_LLVM_VERSION=0 +CONFIG_RUSTC_VERSION=107600 +CONFIG_RUSTC_LLVM_VERSION=170006 CONFIG_CC_CAN_LINK=y CONFIG_CC_CAN_LINK_STATIC=y CONFIG_CC_HAS_ASM_GOTO_OUTPUT=y diff --git a/configs/kernel-6.12.0-s390x-debug.config b/configs/kernel-6.12.0-s390x-debug.config index 1d3388175b30a..f3e2692248e07 100644 --- a/configs/kernel-6.12.0-s390x-debug.config +++ b/configs/kernel-6.12.0-s390x-debug.config @@ -12,8 +12,8 @@ CONFIG_AS_VERSION=25000 CONFIG_LD_IS_BFD=y CONFIG_LD_VERSION=25000 CONFIG_LLD_VERSION=0 -CONFIG_RUSTC_VERSION=0 -CONFIG_RUSTC_LLVM_VERSION=0 +CONFIG_RUSTC_VERSION=107600 +CONFIG_RUSTC_LLVM_VERSION=170006 CONFIG_CC_CAN_LINK=y CONFIG_CC_CAN_LINK_STATIC=y CONFIG_CC_HAS_ASM_GOTO_OUTPUT=y diff --git a/configs/kernel-6.12.0-s390x-zfcpdump.config b/configs/kernel-6.12.0-s390x-zfcpdump.config index b63267eee0579..a7ab40e92fbe2 100644 --- a/configs/kernel-6.12.0-s390x-zfcpdump.config +++ b/configs/kernel-6.12.0-s390x-zfcpdump.config @@ -12,8 +12,8 @@ CONFIG_AS_VERSION=25000 CONFIG_LD_IS_BFD=y CONFIG_LD_VERSION=25000 CONFIG_LLD_VERSION=0 -CONFIG_RUSTC_VERSION=0 -CONFIG_RUSTC_LLVM_VERSION=0 +CONFIG_RUSTC_VERSION=107600 +CONFIG_RUSTC_LLVM_VERSION=170006 CONFIG_CC_CAN_LINK=y CONFIG_CC_CAN_LINK_STATIC=y CONFIG_CC_HAS_ASM_GOTO_OUTPUT=y diff --git a/configs/kernel-6.12.0-s390x.config b/configs/kernel-6.12.0-s390x.config index 2411abd20bf7b..a61fec87569aa 100644 --- a/configs/kernel-6.12.0-s390x.config +++ b/configs/kernel-6.12.0-s390x.config @@ -12,8 +12,8 @@ CONFIG_AS_VERSION=25000 CONFIG_LD_IS_BFD=y CONFIG_LD_VERSION=25000 CONFIG_LLD_VERSION=0 -CONFIG_RUSTC_VERSION=0 -CONFIG_RUSTC_LLVM_VERSION=0 +CONFIG_RUSTC_VERSION=107600 
+CONFIG_RUSTC_LLVM_VERSION=170006 CONFIG_CC_CAN_LINK=y CONFIG_CC_CAN_LINK_STATIC=y CONFIG_CC_HAS_ASM_GOTO_OUTPUT=y diff --git a/configs/kernel-6.12.0-x86_64-automotive-debug.config b/configs/kernel-6.12.0-x86_64-automotive-debug.config index 23f8513cec3be..edd8261a83abe 100644 --- a/configs/kernel-6.12.0-x86_64-automotive-debug.config +++ b/configs/kernel-6.12.0-x86_64-automotive-debug.config @@ -12,8 +12,8 @@ CONFIG_AS_VERSION=25000 CONFIG_LD_IS_BFD=y CONFIG_LD_VERSION=25000 CONFIG_LLD_VERSION=0 -CONFIG_RUSTC_VERSION=0 -CONFIG_RUSTC_LLVM_VERSION=0 +CONFIG_RUSTC_VERSION=107600 +CONFIG_RUSTC_LLVM_VERSION=170006 CONFIG_CC_CAN_LINK=y CONFIG_CC_CAN_LINK_STATIC=y CONFIG_CC_HAS_ASM_GOTO_OUTPUT=y diff --git a/configs/kernel-6.12.0-x86_64-automotive.config b/configs/kernel-6.12.0-x86_64-automotive.config index 49b9d48c7c4b5..911a037e19cf4 100644 --- a/configs/kernel-6.12.0-x86_64-automotive.config +++ b/configs/kernel-6.12.0-x86_64-automotive.config @@ -12,8 +12,8 @@ CONFIG_AS_VERSION=25000 CONFIG_LD_IS_BFD=y CONFIG_LD_VERSION=25000 CONFIG_LLD_VERSION=0 -CONFIG_RUSTC_VERSION=0 -CONFIG_RUSTC_LLVM_VERSION=0 +CONFIG_RUSTC_VERSION=107600 +CONFIG_RUSTC_LLVM_VERSION=170006 CONFIG_CC_CAN_LINK=y CONFIG_CC_CAN_LINK_STATIC=y CONFIG_CC_HAS_ASM_GOTO_OUTPUT=y diff --git a/configs/kernel-6.12.0-x86_64-debug.config b/configs/kernel-6.12.0-x86_64-debug.config index 234f3a0f65495..b1c97090414b0 100644 --- a/configs/kernel-6.12.0-x86_64-debug.config +++ b/configs/kernel-6.12.0-x86_64-debug.config @@ -12,8 +12,8 @@ CONFIG_AS_VERSION=25000 CONFIG_LD_IS_BFD=y CONFIG_LD_VERSION=25000 CONFIG_LLD_VERSION=0 -CONFIG_RUSTC_VERSION=0 -CONFIG_RUSTC_LLVM_VERSION=0 +CONFIG_RUSTC_VERSION=107600 +CONFIG_RUSTC_LLVM_VERSION=170006 CONFIG_CC_CAN_LINK=y CONFIG_CC_CAN_LINK_STATIC=y CONFIG_CC_HAS_ASM_GOTO_OUTPUT=y diff --git a/configs/kernel-6.12.0-x86_64-rt-debug.config b/configs/kernel-6.12.0-x86_64-rt-debug.config index c088efd4130a6..63f1503be0529 100644 --- a/configs/kernel-6.12.0-x86_64-rt-debug.config +++ b/configs/kernel-6.12.0-x86_64-rt-debug.config @@ -12,8 +12,8 @@ CONFIG_AS_VERSION=25000 CONFIG_LD_IS_BFD=y CONFIG_LD_VERSION=25000 CONFIG_LLD_VERSION=0 -CONFIG_RUSTC_VERSION=0 -CONFIG_RUSTC_LLVM_VERSION=0 +CONFIG_RUSTC_VERSION=107600 +CONFIG_RUSTC_LLVM_VERSION=170006 CONFIG_CC_CAN_LINK=y CONFIG_CC_CAN_LINK_STATIC=y CONFIG_CC_HAS_ASM_GOTO_OUTPUT=y diff --git a/configs/kernel-6.12.0-x86_64-rt.config b/configs/kernel-6.12.0-x86_64-rt.config index 309800cfbc90f..2bf9f0ffc666a 100644 --- a/configs/kernel-6.12.0-x86_64-rt.config +++ b/configs/kernel-6.12.0-x86_64-rt.config @@ -12,8 +12,8 @@ CONFIG_AS_VERSION=25000 CONFIG_LD_IS_BFD=y CONFIG_LD_VERSION=25000 CONFIG_LLD_VERSION=0 -CONFIG_RUSTC_VERSION=0 -CONFIG_RUSTC_LLVM_VERSION=0 +CONFIG_RUSTC_VERSION=107600 +CONFIG_RUSTC_LLVM_VERSION=170006 CONFIG_CC_CAN_LINK=y CONFIG_CC_CAN_LINK_STATIC=y CONFIG_CC_HAS_ASM_GOTO_OUTPUT=y diff --git a/configs/kernel-6.12.0-x86_64.config b/configs/kernel-6.12.0-x86_64.config index bba96d6606abf..541b64fa412dd 100644 --- a/configs/kernel-6.12.0-x86_64.config +++ b/configs/kernel-6.12.0-x86_64.config @@ -12,8 +12,8 @@ CONFIG_AS_VERSION=25000 CONFIG_LD_IS_BFD=y CONFIG_LD_VERSION=25000 CONFIG_LLD_VERSION=0 -CONFIG_RUSTC_VERSION=0 -CONFIG_RUSTC_LLVM_VERSION=0 +CONFIG_RUSTC_VERSION=107600 +CONFIG_RUSTC_LLVM_VERSION=170006 CONFIG_CC_CAN_LINK=y CONFIG_CC_CAN_LINK_STATIC=y CONFIG_CC_HAS_ASM_GOTO_OUTPUT=y diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index f299ff393a6a2..d51088980bea7 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ 
-1885,6 +1885,7 @@ static ioctl_fn lookup_ioctl(unsigned int cmd, int *ioctl_flags) {DM_DEV_SET_GEOMETRY_CMD, 0, dev_set_geometry}, {DM_DEV_ARM_POLL_CMD, IOCTL_FLAGS_NO_PARAMS, dev_arm_poll}, {DM_GET_TARGET_VERSION_CMD, 0, get_target_version}, + {DM_MPATH_PROBE_PATHS_CMD, 0, NULL}, /* block device ioctl */ }; if (unlikely(cmd >= ARRAY_SIZE(_ioctls))) diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 368606afb6f05..dc741a2cf93e0 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -79,6 +79,7 @@ struct multipath { struct pgpath *current_pgpath; struct priority_group *current_pg; struct priority_group *next_pg; /* Switch to this PG if set */ + struct priority_group *last_probed_pg; atomic_t nr_valid_paths; /* Total number of usable paths */ unsigned int nr_priority_groups; @@ -87,6 +88,7 @@ struct multipath { const char *hw_handler_name; char *hw_handler_params; wait_queue_head_t pg_init_wait; /* Wait for pg_init completion */ + wait_queue_head_t probe_wait; /* Wait for probing paths */ unsigned int pg_init_retries; /* Number of times to retry pg_init */ unsigned int pg_init_delay_msecs; /* Number of msecs before pg_init retry */ atomic_t pg_init_in_progress; /* Only one pg_init allowed at once */ @@ -100,6 +102,7 @@ struct multipath { struct bio_list queued_bios; struct timer_list nopath_timer; /* Timeout for queue_if_no_path */ + bool is_suspending; }; /* @@ -132,6 +135,8 @@ static void queue_if_no_path_timeout_work(struct timer_list *t); #define MPATHF_PG_INIT_DISABLED 4 /* pg_init is not currently allowed */ #define MPATHF_PG_INIT_REQUIRED 5 /* pg_init needs calling? */ #define MPATHF_PG_INIT_DELAY_RETRY 6 /* Delay pg_init retry? */ +#define MPATHF_DELAY_PG_SWITCH 7 /* Delay switching pg if it still has paths */ +#define MPATHF_NEED_PG_SWITCH 8 /* Need to switch pgs after the delay has ended */ static bool mpath_double_check_test_bit(int MPATHF_bit, struct multipath *m) { @@ -254,6 +259,7 @@ static int alloc_multipath_stage2(struct dm_target *ti, struct multipath *m) atomic_set(&m->pg_init_count, 0); m->pg_init_delay_msecs = DM_PG_INIT_DELAY_DEFAULT; init_waitqueue_head(&m->pg_init_wait); + init_waitqueue_head(&m->probe_wait); return 0; } @@ -413,13 +419,21 @@ static struct pgpath *choose_pgpath(struct multipath *m, size_t nr_bytes) goto failed; } + /* Don't change PG until it has no remaining paths */ + pg = READ_ONCE(m->current_pg); + if (pg) { + pgpath = choose_path_in_pg(m, pg, nr_bytes); + if (!IS_ERR_OR_NULL(pgpath)) + return pgpath; + } + /* Were we instructed to switch PG? */ if (READ_ONCE(m->next_pg)) { spin_lock_irqsave(&m->lock, flags); pg = m->next_pg; if (!pg) { spin_unlock_irqrestore(&m->lock, flags); - goto check_current_pg; + goto check_all_pgs; } m->next_pg = NULL; spin_unlock_irqrestore(&m->lock, flags); @@ -427,16 +441,7 @@ static struct pgpath *choose_pgpath(struct multipath *m, size_t nr_bytes) if (!IS_ERR_OR_NULL(pgpath)) return pgpath; } - - /* Don't change PG until it has no remaining paths */ -check_current_pg: - pg = READ_ONCE(m->current_pg); - if (pg) { - pgpath = choose_path_in_pg(m, pg, nr_bytes); - if (!IS_ERR_OR_NULL(pgpath)) - return pgpath; - } - +check_all_pgs: /* * Loop through priority groups until we find a valid path. * First time we skip PGs marked 'bypassed'. @@ -612,7 +617,6 @@ static void multipath_queue_bio(struct multipath *m, struct bio *bio) static struct pgpath *__map_bio(struct multipath *m, struct bio *bio) { struct pgpath *pgpath; - unsigned long flags; /* Do we need to select a new pgpath? 
*/ pgpath = READ_ONCE(m->current_pgpath); @@ -620,12 +624,12 @@ static struct pgpath *__map_bio(struct multipath *m, struct bio *bio) pgpath = choose_pgpath(m, bio->bi_iter.bi_size); if (!pgpath) { - spin_lock_irqsave(&m->lock, flags); + spin_lock_irq(&m->lock); if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) { __multipath_queue_bio(m, bio); pgpath = ERR_PTR(-EAGAIN); } - spin_unlock_irqrestore(&m->lock, flags); + spin_unlock_irq(&m->lock); } else if (mpath_double_check_test_bit(MPATHF_QUEUE_IO, m) || mpath_double_check_test_bit(MPATHF_PG_INIT_REQUIRED, m)) { @@ -688,7 +692,6 @@ static void process_queued_io_list(struct multipath *m) static void process_queued_bios(struct work_struct *work) { int r; - unsigned long flags; struct bio *bio; struct bio_list bios; struct blk_plug plug; @@ -697,16 +700,16 @@ static void process_queued_bios(struct work_struct *work) bio_list_init(&bios); - spin_lock_irqsave(&m->lock, flags); + spin_lock_irq(&m->lock); if (bio_list_empty(&m->queued_bios)) { - spin_unlock_irqrestore(&m->lock, flags); + spin_unlock_irq(&m->lock); return; } bio_list_merge_init(&bios, &m->queued_bios); - spin_unlock_irqrestore(&m->lock, flags); + spin_unlock_irq(&m->lock); blk_start_plug(&plug); while ((bio = bio_list_pop(&bios))) { @@ -1190,7 +1193,6 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc, char **argv) struct dm_arg_set as; unsigned int pg_count = 0; unsigned int next_pg_num; - unsigned long flags; as.argc = argc; as.argv = argv; @@ -1255,9 +1257,9 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad; } - spin_lock_irqsave(&m->lock, flags); + spin_lock_irq(&m->lock); enable_nopath_timeout(m); - spin_unlock_irqrestore(&m->lock, flags); + spin_unlock_irq(&m->lock); ti->num_flush_bios = 1; ti->num_discard_bios = 1; @@ -1292,23 +1294,21 @@ static void multipath_wait_for_pg_init_completion(struct multipath *m) static void flush_multipath_work(struct multipath *m) { if (m->hw_handler_name) { - unsigned long flags; - if (!atomic_read(&m->pg_init_in_progress)) goto skip; - spin_lock_irqsave(&m->lock, flags); + spin_lock_irq(&m->lock); if (atomic_read(&m->pg_init_in_progress) && !test_and_set_bit(MPATHF_PG_INIT_DISABLED, &m->flags)) { - spin_unlock_irqrestore(&m->lock, flags); + spin_unlock_irq(&m->lock); flush_workqueue(kmpath_handlerd); multipath_wait_for_pg_init_completion(m); - spin_lock_irqsave(&m->lock, flags); + spin_lock_irq(&m->lock); clear_bit(MPATHF_PG_INIT_DISABLED, &m->flags); } - spin_unlock_irqrestore(&m->lock, flags); + spin_unlock_irq(&m->lock); } skip: if (m->queue_mode == DM_TYPE_BIO_BASED) @@ -1370,11 +1370,10 @@ static int fail_path(struct pgpath *pgpath) static int reinstate_path(struct pgpath *pgpath) { int r = 0, run_queue = 0; - unsigned long flags; struct multipath *m = pgpath->pg->m; unsigned int nr_valid_paths; - spin_lock_irqsave(&m->lock, flags); + spin_lock_irq(&m->lock); if (pgpath->is_active) goto out; @@ -1404,7 +1403,7 @@ static int reinstate_path(struct pgpath *pgpath) schedule_work(&m->trigger_event); out: - spin_unlock_irqrestore(&m->lock, flags); + spin_unlock_irq(&m->lock); if (run_queue) { dm_table_run_md_queue_async(m->ti->table); process_queued_io_list(m); @@ -1439,15 +1438,19 @@ static int action_dev(struct multipath *m, dev_t dev, action_fn action) * Temporarily try to avoid having to use the specified PG */ static void bypass_pg(struct multipath *m, struct priority_group *pg, - bool bypassed) + bool bypassed, bool can_be_delayed) { unsigned long flags; 
spin_lock_irqsave(&m->lock, flags); pg->bypassed = bypassed; - m->current_pgpath = NULL; - m->current_pg = NULL; + if (can_be_delayed && test_bit(MPATHF_DELAY_PG_SWITCH, &m->flags)) + set_bit(MPATHF_NEED_PG_SWITCH, &m->flags); + else { + m->current_pgpath = NULL; + m->current_pg = NULL; + } spin_unlock_irqrestore(&m->lock, flags); @@ -1461,7 +1464,6 @@ static int switch_pg_num(struct multipath *m, const char *pgstr) { struct priority_group *pg; unsigned int pgnum; - unsigned long flags; char dummy; if (!pgstr || (sscanf(pgstr, "%u%c", &pgnum, &dummy) != 1) || !pgnum || @@ -1470,17 +1472,21 @@ static int switch_pg_num(struct multipath *m, const char *pgstr) return -EINVAL; } - spin_lock_irqsave(&m->lock, flags); + spin_lock_irq(&m->lock); list_for_each_entry(pg, &m->priority_groups, list) { pg->bypassed = false; if (--pgnum) continue; - m->current_pgpath = NULL; - m->current_pg = NULL; + if (test_bit(MPATHF_DELAY_PG_SWITCH, &m->flags)) + set_bit(MPATHF_NEED_PG_SWITCH, &m->flags); + else { + m->current_pgpath = NULL; + m->current_pg = NULL; + } m->next_pg = pg; } - spin_unlock_irqrestore(&m->lock, flags); + spin_unlock_irq(&m->lock); schedule_work(&m->trigger_event); return 0; @@ -1507,7 +1513,7 @@ static int bypass_pg_num(struct multipath *m, const char *pgstr, bool bypassed) break; } - bypass_pg(m, pg, bypassed); + bypass_pg(m, pg, bypassed, true); return 0; } @@ -1561,7 +1567,7 @@ static void pg_init_done(void *data, int errors) * Probably doing something like FW upgrade on the * controller so try the other pg. */ - bypass_pg(m, pg, true); + bypass_pg(m, pg, true, false); break; case SCSI_DH_RETRY: /* Wait before retrying. */ @@ -1742,6 +1748,9 @@ static void multipath_presuspend(struct dm_target *ti) { struct multipath *m = ti->private; + spin_lock_irq(&m->lock); + m->is_suspending = true; + spin_unlock_irq(&m->lock); /* FIXME: bio-based shouldn't need to always disable queue_if_no_path */ if (m->queue_mode == DM_TYPE_BIO_BASED || !dm_noflush_suspending(m->ti)) queue_if_no_path(m, false, true, __func__); @@ -1762,9 +1771,9 @@ static void multipath_postsuspend(struct dm_target *ti) static void multipath_resume(struct dm_target *ti) { struct multipath *m = ti->private; - unsigned long flags; - spin_lock_irqsave(&m->lock, flags); + spin_lock_irq(&m->lock); + m->is_suspending = false; if (test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags)) { set_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags); clear_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags); @@ -1775,7 +1784,7 @@ static void multipath_resume(struct dm_target *ti) test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags), test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags)); - spin_unlock_irqrestore(&m->lock, flags); + spin_unlock_irq(&m->lock); } /* @@ -1798,14 +1807,13 @@ static void multipath_status(struct dm_target *ti, status_type_t type, unsigned int status_flags, char *result, unsigned int maxlen) { int sz = 0, pg_counter, pgpath_counter; - unsigned long flags; struct multipath *m = ti->private; struct priority_group *pg; struct pgpath *p; unsigned int pg_num; char state; - spin_lock_irqsave(&m->lock, flags); + spin_lock_irq(&m->lock); /* Features */ if (type == STATUSTYPE_INFO) @@ -1845,10 +1853,10 @@ static void multipath_status(struct dm_target *ti, status_type_t type, DMEMIT("%u ", m->nr_priority_groups); - if (m->next_pg) - pg_num = m->next_pg->pg_num; - else if (m->current_pg) + if (m->current_pg) pg_num = m->current_pg->pg_num; + else if (m->next_pg) + pg_num = m->next_pg->pg_num; else pg_num = (m->nr_priority_groups ? 
1 : 0); @@ -1951,7 +1959,7 @@ static void multipath_status(struct dm_target *ti, status_type_t type, break; } - spin_unlock_irqrestore(&m->lock, flags); + spin_unlock_irq(&m->lock); } static int multipath_message(struct dm_target *ti, unsigned int argc, char **argv, @@ -1961,7 +1969,6 @@ static int multipath_message(struct dm_target *ti, unsigned int argc, char **arg dev_t dev; struct multipath *m = ti->private; action_fn action; - unsigned long flags; mutex_lock(&m->work_mutex); @@ -1973,9 +1980,9 @@ static int multipath_message(struct dm_target *ti, unsigned int argc, char **arg if (argc == 1) { if (!strcasecmp(argv[0], "queue_if_no_path")) { r = queue_if_no_path(m, true, false, __func__); - spin_lock_irqsave(&m->lock, flags); + spin_lock_irq(&m->lock); enable_nopath_timeout(m); - spin_unlock_irqrestore(&m->lock, flags); + spin_unlock_irq(&m->lock); goto out; } else if (!strcasecmp(argv[0], "fail_if_no_path")) { r = queue_if_no_path(m, false, false, __func__); @@ -2021,6 +2028,113 @@ static int multipath_message(struct dm_target *ti, unsigned int argc, char **arg return r; } +/* + * Perform a minimal read from the given path to find out whether the + * path still works. If a path error occurs, fail it. + */ +static int probe_path(struct pgpath *pgpath) +{ + struct block_device *bdev = pgpath->path.dev->bdev; + unsigned int read_size = bdev_logical_block_size(bdev); + struct page *page; + struct bio *bio; + blk_status_t status; + int r = 0; + + if (WARN_ON_ONCE(read_size > PAGE_SIZE)) + return -EINVAL; + + page = alloc_page(GFP_KERNEL); + if (!page) + return -ENOMEM; + + /* Perform a minimal read: Sector 0, length read_size */ + bio = bio_alloc(bdev, 1, REQ_OP_READ, GFP_KERNEL); + if (!bio) { + r = -ENOMEM; + goto out; + } + + bio->bi_iter.bi_sector = 0; + __bio_add_page(bio, page, read_size, 0); + submit_bio_wait(bio); + status = bio->bi_status; + bio_put(bio); + + if (status && blk_path_error(status)) + fail_path(pgpath); + +out: + __free_page(page); + return r; +} + +/* + * Probe all active paths in current_pg to find out whether they still work. + * Fail all paths that do not work. + * + * Return -ENOTCONN if no valid path is left (even outside of current_pg). We + * cannot probe paths in other pgs without switching current_pg, so if valid + * paths are only in different pgs, they may or may not work. Additionally + * we should not probe paths in a pathgroup that is in the process of + * Initializing. Userspace can submit a request and we'll switch and wait + * for the pathgroup to be initialized. If the request fails, it may need to + * probe again. + */ +static int probe_active_paths(struct multipath *m) +{ + struct pgpath *pgpath; + struct priority_group *pg = NULL; + int r = 0; + + spin_lock_irq(&m->lock); + if (test_bit(MPATHF_DELAY_PG_SWITCH, &m->flags)) { + wait_event_lock_irq(m->probe_wait, + !test_bit(MPATHF_DELAY_PG_SWITCH, &m->flags), + m->lock); + /* + * if we waited because a probe was already in progress, + * and it probed the current active pathgroup, don't + * reprobe. 
Just return the number of valid paths + */ + if (m->current_pg == m->last_probed_pg) + goto skip_probe; + } + if (!m->current_pg || m->is_suspending || + test_bit(MPATHF_QUEUE_IO, &m->flags)) + goto skip_probe; + set_bit(MPATHF_DELAY_PG_SWITCH, &m->flags); + pg = m->last_probed_pg = m->current_pg; + spin_unlock_irq(&m->lock); + + list_for_each_entry(pgpath, &pg->pgpaths, list) { + if (pg != READ_ONCE(m->current_pg) || + READ_ONCE(m->is_suspending)) + goto out; + if (!pgpath->is_active) + continue; + + r = probe_path(pgpath); + if (r < 0) + goto out; + } + +out: + spin_lock_irq(&m->lock); + clear_bit(MPATHF_DELAY_PG_SWITCH, &m->flags); + if (test_and_clear_bit(MPATHF_NEED_PG_SWITCH, &m->flags)) { + m->current_pgpath = NULL; + m->current_pg = NULL; + } +skip_probe: + if (r == 0 && !atomic_read(&m->nr_valid_paths)) + r = -ENOTCONN; + spin_unlock_irq(&m->lock); + if (pg) + wake_up(&m->probe_wait); + return r; +} + static int multipath_prepare_ioctl(struct dm_target *ti, struct block_device **bdev, unsigned int cmd, unsigned long arg, @@ -2028,9 +2142,18 @@ static int multipath_prepare_ioctl(struct dm_target *ti, { struct multipath *m = ti->private; struct pgpath *pgpath; - unsigned long flags; int r; + if (_IOC_TYPE(cmd) == DM_IOCTL) { + *forward = false; + switch (cmd) { + case DM_MPATH_PROBE_PATHS: + return probe_active_paths(m); + default: + return -ENOTTY; + } + } + pgpath = READ_ONCE(m->current_pgpath); if (!pgpath || !mpath_double_check_test_bit(MPATHF_QUEUE_IO, m)) pgpath = choose_pgpath(m, 0); @@ -2046,10 +2169,10 @@ static int multipath_prepare_ioctl(struct dm_target *ti, } else { /* No path is available */ r = -EIO; - spin_lock_irqsave(&m->lock, flags); + spin_lock_irq(&m->lock); if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) r = -ENOTCONN; - spin_unlock_irqrestore(&m->lock, flags); + spin_unlock_irq(&m->lock); } if (r == -ENOTCONN) { @@ -2057,10 +2180,10 @@ static int multipath_prepare_ioctl(struct dm_target *ti, /* Path status changed, redo selection */ (void) choose_pgpath(m, 0); } - spin_lock_irqsave(&m->lock, flags); + spin_lock_irq(&m->lock); if (test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags)) (void) __pg_init_all_paths(m); - spin_unlock_irqrestore(&m->lock, flags); + spin_unlock_irq(&m->lock); dm_table_run_md_queue_async(m->ti->table); process_queued_io_list(m); } @@ -2182,7 +2305,7 @@ static int multipath_busy(struct dm_target *ti) */ static struct target_type multipath_target = { .name = "multipath", - .version = {1, 14, 0}, + .version = {1, 15, 0}, .features = DM_TARGET_SINGLETON | DM_TARGET_IMMUTABLE | DM_TARGET_PASSES_INTEGRITY, .module = THIS_MODULE, diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c index 6c89b486b7122..d75606ebf1930 100644 --- a/drivers/scsi/storvsc_drv.c +++ b/drivers/scsi/storvsc_drv.c @@ -1890,6 +1890,7 @@ static struct scsi_host_template scsi_driver = { .no_write_same = 1, .track_queue_depth = 1, .change_queue_depth = storvsc_change_queue_depth, + .max_segment_size = 0xffffffff, }; enum { diff --git a/include/linux/mm.h b/include/linux/mm.h index 5478c1ca5050f..f321e86144dcc 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -369,6 +369,64 @@ extern unsigned int kobjsize(const void *objp); # define VM_SHADOW_STACK VM_NONE #endif +/* + * DMA mapping IDs for page_pool + * + * When DMA-mapping a page, page_pool allocates an ID (from an xarray) and + * stashes it in the upper bits of page->pp_magic. We always want to be able to + * unambiguously identify page pool pages (using page_pool_page_is_pp()). 
Non-PP + * pages can have arbitrary kernel pointers stored in the same field as pp_magic + * (since it overlaps with page->lru.next), so we must ensure that we cannot + * mistake a valid kernel pointer with any of the values we write into this + * field. + * + * On architectures that set POISON_POINTER_DELTA, this is already ensured, + * since this value becomes part of PP_SIGNATURE; meaning we can just use the + * space between the PP_SIGNATURE value (without POISON_POINTER_DELTA), and the + * lowest bits of POISON_POINTER_DELTA. On arches where POISON_POINTER_DELTA is + * 0, we make sure that we leave the two topmost bits empty, as that guarantees + * we won't mistake a valid kernel pointer for a value we set, regardless of the + * VMSPLIT setting. + * + * Altogether, this means that the number of bits available is constrained by + * the size of an unsigned long (at the upper end, subtracting two bits per the + * above), and the definition of PP_SIGNATURE (with or without + * POISON_POINTER_DELTA). + */ +#define PP_DMA_INDEX_SHIFT (1 + __fls(PP_SIGNATURE - POISON_POINTER_DELTA)) +#if POISON_POINTER_DELTA > 0 +/* PP_SIGNATURE includes POISON_POINTER_DELTA, so limit the size of the DMA + * index to not overlap with that if set + */ +#define PP_DMA_INDEX_BITS MIN(32, __ffs(POISON_POINTER_DELTA) - PP_DMA_INDEX_SHIFT) +#else +/* Always leave out the topmost two; see above. */ +#define PP_DMA_INDEX_BITS MIN(32, BITS_PER_LONG - PP_DMA_INDEX_SHIFT - 2) +#endif + +#define PP_DMA_INDEX_MASK GENMASK(PP_DMA_INDEX_BITS + PP_DMA_INDEX_SHIFT - 1, \ + PP_DMA_INDEX_SHIFT) + +/* Mask used for checking in page_pool_page_is_pp() below. page->pp_magic is + * OR'ed with PP_SIGNATURE after the allocation in order to preserve bit 0 for + * the head page of compound page and bit 1 for pfmemalloc page, as well as the + * bits used for the DMA index. page_is_pfmemalloc() is checked in + * __page_pool_put_page() to avoid recycling the pfmemalloc page. + */ +#define PP_MAGIC_MASK ~(PP_DMA_INDEX_MASK | 0x3UL) + +#ifdef CONFIG_PAGE_POOL +static inline bool page_pool_page_is_pp(struct page *page) +{ + return (page->pp_magic & PP_MAGIC_MASK) == PP_SIGNATURE; +} +#else +static inline bool page_pool_page_is_pp(struct page *page) +{ + return false; +} +#endif + #if defined(CONFIG_X86) # define VM_PAT VM_ARCH_1 /* PAT reserves whole VMA at once (x86) */ #elif defined(CONFIG_PPC64) @@ -4224,62 +4282,4 @@ static inline void pgalloc_tag_copy(struct folio *new, struct folio *old) } #endif /* CONFIG_MEM_ALLOC_PROFILING */ -/* - * DMA mapping IDs for page_pool - * - * When DMA-mapping a page, page_pool allocates an ID (from an xarray) and - * stashes it in the upper bits of page->pp_magic. We always want to be able to - * unambiguously identify page pool pages (using page_pool_page_is_pp()). Non-PP - * pages can have arbitrary kernel pointers stored in the same field as pp_magic - * (since it overlaps with page->lru.next), so we must ensure that we cannot - * mistake a valid kernel pointer with any of the values we write into this - * field. - * - * On architectures that set POISON_POINTER_DELTA, this is already ensured, - * since this value becomes part of PP_SIGNATURE; meaning we can just use the - * space between the PP_SIGNATURE value (without POISON_POINTER_DELTA), and the - * lowest bits of POISON_POINTER_DELTA. 
On arches where POISON_POINTER_DELTA is - * 0, we make sure that we leave the two topmost bits empty, as that guarantees - * we won't mistake a valid kernel pointer for a value we set, regardless of the - * VMSPLIT setting. - * - * Altogether, this means that the number of bits available is constrained by - * the size of an unsigned long (at the upper end, subtracting two bits per the - * above), and the definition of PP_SIGNATURE (with or without - * POISON_POINTER_DELTA). - */ -#define PP_DMA_INDEX_SHIFT (1 + __fls(PP_SIGNATURE - POISON_POINTER_DELTA)) -#if POISON_POINTER_DELTA > 0 -/* PP_SIGNATURE includes POISON_POINTER_DELTA, so limit the size of the DMA - * index to not overlap with that if set - */ -#define PP_DMA_INDEX_BITS MIN(32, __ffs(POISON_POINTER_DELTA) - PP_DMA_INDEX_SHIFT) -#else -/* Always leave out the topmost two; see above. */ -#define PP_DMA_INDEX_BITS MIN(32, BITS_PER_LONG - PP_DMA_INDEX_SHIFT - 2) -#endif - -#define PP_DMA_INDEX_MASK GENMASK(PP_DMA_INDEX_BITS + PP_DMA_INDEX_SHIFT - 1, \ - PP_DMA_INDEX_SHIFT) - -/* Mask used for checking in page_pool_page_is_pp() below. page->pp_magic is - * OR'ed with PP_SIGNATURE after the allocation in order to preserve bit 0 for - * the head page of compound page and bit 1 for pfmemalloc page, as well as the - * bits used for the DMA index. page_is_pfmemalloc() is checked in - * __page_pool_put_page() to avoid recycling the pfmemalloc page. - */ -#define PP_MAGIC_MASK ~(PP_DMA_INDEX_MASK | 0x3UL) - -#ifdef CONFIG_PAGE_POOL -static inline bool page_pool_page_is_pp(struct page *page) -{ - return (page->pp_magic & PP_MAGIC_MASK) == PP_SIGNATURE; -} -#else -static inline bool page_pool_page_is_pp(struct page *page) -{ - return false; -} -#endif - #endif /* _LINUX_MM_H */ diff --git a/include/net/page_pool/types.h b/include/net/page_pool/types.h index f53e2c90b6866..e4490076dfc4f 100644 --- a/include/net/page_pool/types.h +++ b/include/net/page_pool/types.h @@ -190,6 +190,8 @@ struct page_pool { #endif u32 xdp_mem_id; + RH_KABI_FILL_HOLE(struct xarray dma_mapped) + /* * Data structure for allocation side * @@ -220,8 +222,6 @@ struct page_pool { void *mp_priv; - struct xarray dma_mapped; - #ifdef CONFIG_PAGE_POOL_STATS /* recycle stats are per-cpu to avoid locking */ struct page_pool_recycle_stats __percpu *recycle_stats; diff --git a/include/uapi/linux/dm-ioctl.h b/include/uapi/linux/dm-ioctl.h index 1990b5700f694..89ec2afeea2b2 100644 --- a/include/uapi/linux/dm-ioctl.h +++ b/include/uapi/linux/dm-ioctl.h @@ -258,10 +258,12 @@ enum { DM_DEV_SET_GEOMETRY_CMD, DM_DEV_ARM_POLL_CMD, DM_GET_TARGET_VERSION_CMD, + DM_MPATH_PROBE_PATHS_CMD, }; #define DM_IOCTL 0xfd +/* Control device ioctls */ #define DM_VERSION _IOWR(DM_IOCTL, DM_VERSION_CMD, struct dm_ioctl) #define DM_REMOVE_ALL _IOWR(DM_IOCTL, DM_REMOVE_ALL_CMD, struct dm_ioctl) #define DM_LIST_DEVICES _IOWR(DM_IOCTL, DM_LIST_DEVICES_CMD, struct dm_ioctl) @@ -285,10 +287,13 @@ enum { #define DM_TARGET_MSG _IOWR(DM_IOCTL, DM_TARGET_MSG_CMD, struct dm_ioctl) #define DM_DEV_SET_GEOMETRY _IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl) +/* Block device ioctls */ +#define DM_MPATH_PROBE_PATHS _IO(DM_IOCTL, DM_MPATH_PROBE_PATHS_CMD) + #define DM_VERSION_MAJOR 4 #define DM_VERSION_MINOR 48 -#define DM_VERSION_PATCHLEVEL 0 -#define DM_VERSION_EXTRA "-ioctl (2023-03-01)" +#define DM_VERSION_PATCHLEVEL 1 +#define DM_VERSION_EXTRA "-ioctl (2025-04-28)" /* Status bits */ #define DM_READONLY_FLAG (1 << 0) /* In/Out */ diff --git a/redhat/kernel.changelog-10.0 b/redhat/kernel.changelog-10.0 
index f852dc17f5780..164ae91885b1f 100644 --- a/redhat/kernel.changelog-10.0 +++ b/redhat/kernel.changelog-10.0 @@ -1,3 +1,17 @@ +* Mon Jun 30 2025 Jan Stancek [6.12.0-55.20.1.el10_0] +- x86/microcode/AMD: Fix out-of-bounds on systems with CPU-less NUMA nodes (CKI Backport Bot) [RHEL-99007] {CVE-2025-21991} +- page_pool: Track DMA-mapped pages and unmap them when destroying the pool (Toke Høiland-Jørgensen) [RHEL-84148] +- page_pool: Move pp_magic check into helper functions (Toke Høiland-Jørgensen) [RHEL-84148] +- scsi: storvsc: Explicitly set max_segment_size to UINT_MAX (Ewan D. Milne) [RHEL-97172] +- vmxnet3: Fix malformed packet sizing in vmxnet3_process_xdp (CKI Backport Bot) [RHEL-97116] {CVE-2025-37799} +- dm mpath: replace spin_lock_irqsave with spin_lock_irq (Benjamin Marzinski) [RHEL-89484] +- dm-mpath: Don't grab work_mutex while probing paths (Benjamin Marzinski) [RHEL-89484] +- dm mpath: Interface for explicit probing of active paths (Benjamin Marzinski) [RHEL-89484] +- dm: Allow .prepare_ioctl to handle ioctls directly (Benjamin Marzinski) [RHEL-89484] +- ipv6: mcast: extend RCU protection in igmp6_send() (CKI Backport Bot) [RHEL-94685] {CVE-2025-21759} +- net: add dev_net_rcu() helper (Hangbin Liu) [RHEL-94685] +Resolves: RHEL-84148, RHEL-89484, RHEL-94685, RHEL-97116, RHEL-97172, RHEL-99007 + * Sun Jun 22 2025 Jan Stancek [6.12.0-55.19.1.el10_0] - ibmvnic: Use kernel helpers for hex dumps (CKI Backport Bot) [RHEL-89031] {CVE-2025-22104} - eth: bnxt: fix truesize for mb-xdp-pass case (CKI Backport Bot) [RHEL-88329] {CVE-2025-21961}
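
Usage note for the DM_MPATH_PROBE_PATHS ioctl added by the dm-mpath patches above: the following is a minimal userspace sketch, not part of the patch series. It assumes the patched <linux/dm-ioctl.h> uapi header is installed and that /dev/mapper/mpatha is an existing dm-mpath device node (both names are illustrative). Based on the kernel-side probe_active_paths(), the call returns 0 while at least one path remains valid and fails with errno ENOTCONN once no valid path is left.

/* Minimal sketch: trigger an explicit probe of the active dm-mpath
 * path group from userspace. Hypothetical device node; requires the
 * patched uapi header for DM_MPATH_PROBE_PATHS.
 */
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/dm-ioctl.h>

int main(void)
{
	int fd = open("/dev/mapper/mpatha", O_RDONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* DM_MPATH_PROBE_PATHS is an _IO() ioctl with no argument payload.
	 * The target issues a minimal read on each active path in the
	 * current path group and fails the paths that return a path error.
	 */
	if (ioctl(fd, DM_MPATH_PROBE_PATHS) == 0)
		printf("probe done; at least one valid path remains\n");
	else if (errno == ENOTCONN)
		fprintf(stderr, "no valid paths remain\n");
	else
		fprintf(stderr, "probe failed: %s\n", strerror(errno));

	close(fd);
	return 0;
}

Because the ioctl is handled by multipath_prepare_ioctl() itself (*forward = false), it is not passed down to the underlying SCSI device, which is what makes it safe to issue on any dm-mpath table that supports target version 1.15.0 or later.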