Skip to content

[fromtree] bluetooth: fix bug when destroying tx buffers on disconnected #3026

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 6 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 11 additions & 5 deletions subsys/bluetooth/host/classic/l2cap_br.c
Original file line number Diff line number Diff line change
Expand Up @@ -1574,6 +1574,8 @@ struct net_buf *l2cap_br_data_pull(struct bt_conn *conn, size_t amount, size_t *
return NULL;
}

__ASSERT_NO_MSG(conn->state == BT_CONN_CONNECTED);

struct bt_l2cap_br_chan *br_chan;

br_chan = CONTAINER_OF(pdu_ready, struct bt_l2cap_br_chan, _pdu_ready);
Expand All @@ -1591,27 +1593,31 @@ struct net_buf *l2cap_br_data_pull(struct bt_conn *conn, size_t amount, size_t *

__ASSERT(tx_pdu, "signaled ready but no PDUs in the TX queue");

struct net_buf *pdu = CONTAINER_OF(tx_pdu, struct net_buf, node);
struct net_buf *q_pdu = CONTAINER_OF(tx_pdu, struct net_buf, node);

if (bt_buf_has_view(pdu)) {
LOG_ERR("already have view on %p", pdu);
if (bt_buf_has_view(q_pdu)) {
LOG_ERR("already have view on %p", q_pdu);
return NULL;
}

struct net_buf *pdu = net_buf_ref(q_pdu);

/* We can't interleave ACL fragments from different channels for the
* same ACL conn -> we have to wait until a full L2 PDU is transferred
* before switching channels.
*/
bool last_frag = amount >= pdu->len;

if (last_frag) {
LOG_DBG("last frag, removing %p", pdu);
LOG_DBG("last frag, removing %p", q_pdu);
__maybe_unused bool found;

found = sys_slist_find_and_remove(&br_chan->_pdu_tx_queue, &pdu->node);
found = sys_slist_find_and_remove(&br_chan->_pdu_tx_queue, &q_pdu->node);

__ASSERT_NO_MSG(found);

net_buf_unref(q_pdu);

LOG_DBG("chan %p done", br_chan);
lower_data_ready(br_chan);

Expand Down
132 changes: 52 additions & 80 deletions subsys/bluetooth/host/conn.c
Original file line number Diff line number Diff line change
Expand Up @@ -649,8 +649,8 @@
}

static int send_buf(struct bt_conn *conn, struct net_buf *buf,
size_t len, void *cb, void *ud)
size_t len, bt_conn_tx_cb_t cb, void *ud)
{

Check notice on line 653 in subsys/bluetooth/host/conn.c

View workflow job for this annotation

GitHub Actions / Run compliance checks on patch series (PR)

You may want to run clang-format on this change

subsys/bluetooth/host/conn.c:653 -static int send_buf(struct bt_conn *conn, struct net_buf *buf, - size_t len, bt_conn_tx_cb_t cb, void *ud) +static int send_buf(struct bt_conn *conn, struct net_buf *buf, size_t len, bt_conn_tx_cb_t cb, + void *ud)
struct net_buf *frag = NULL;
struct bt_conn_tx *tx = NULL;
uint8_t flags;
Expand All @@ -659,13 +659,15 @@
if (buf->len == 0) {
__ASSERT_NO_MSG(0);

return -EMSGSIZE;
err = -EMSGSIZE;
goto error_return;
}

if (bt_buf_has_view(buf)) {
__ASSERT_NO_MSG(0);

return -EIO;
err = -EIO;
goto error_return;
}

LOG_DBG("conn %p buf %p len %zu buf->len %u cb %p ud %p",
Expand All @@ -680,7 +682,8 @@
*/
__ASSERT(0, "No controller bufs");

return -ENOMEM;
err = -ENOMEM;
goto error_return;
}

/* Allocate and set the TX context */
Expand All @@ -689,37 +692,42 @@
/* See big comment above */
if (!tx) {
__ASSERT(0, "No TX context");

return -ENOMEM;
k_sem_give(bt_conn_get_pkts(conn));
err = -ENOMEM;
goto error_return;
}

tx->cb = cb;
tx->user_data = ud;

uint16_t frag_len = MIN(conn_mtu(conn), len);

/* If ATT sent callback is delayed until data transmission is done by BLE controller, the
* transmitted buffer may have an additional reference. The reference is used to extend
* lifetime of the net buffer until the data transmission is confirmed by ACK of the remote.
/* If ATT sent callback is delayed until data transmission
Copy link
Contributor

@KyraLengfeld KyraLengfeld Jul 10, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nitpick: the maximum line length (column limit) has been increased, so it is better to follow the new limit. You can probably just write this as two long lines and run clang-format.

* is done by BLE controller, the transmitted buffer may
* have an additional reference. The reference is used to
* extend lifetime of the net buffer until the data
* transmission is confirmed by ACK of the remote.
*
* send_buf function can be called multiple times, if buffer
* has to be fragmented over HCI. In that case, the callback
* is provided as an argument only for the last transmitted
* fragment. The `buf->ref == 1` (or 2) check is skipped
* because it's impossible to properly validate number of
* references for the sent fragments if buffers may have the
* additional reference.
*
* send_buf function can be called multiple times, if buffer has to be fragmented over HCI.
* In that case, the callback is provided as an argument only for the last transmitted
* fragment. The `buf->ref == 1` check is skipped because it's impossible to properly
* validate number of references for the sent fragments if buffers may have the additional
* reference.
* Otherwise, check that buf->ref is 1 or 2. It would be 1
* if this was the only reference (e.g. buf was removed from
* the conn tx_queue). It would be 2 if the tx_data_pull
* kept it on the tx_queue for segmentation.
*/
__ASSERT_NO_MSG(IS_ENABLED(CONFIG_BT_ATT_SENT_CB_AFTER_TX) || (buf->ref == 1));
__ASSERT_NO_MSG(IS_ENABLED(CONFIG_BT_ATT_SENT_CB_AFTER_TX) || (buf->ref == 1) ||
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we squash the 2 noups? This one reverts part of the first noup

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Discussed offline with @alwa-nordic. We will not squash them within the scope of this PR.

(buf->ref == 2));

if (buf->len > frag_len) {
LOG_DBG("keep %p around", buf);
frag = get_data_frag(net_buf_ref(buf), frag_len);
} else {
LOG_DBG("move %p ref in", buf);
/* Move the ref into `frag` for the last TX. That way `buf` will
* get destroyed when `frag` is destroyed.
*/
frag = get_data_frag(buf, frag_len);
}
/* The reference is always transferred to the frag, so when
* the frag is destroyed, the parent reference is decremented.
*/
frag = get_data_frag(buf, frag_len);

/* Caller is supposed to check we have all resources to send */
__ASSERT_NO_MSG(frag != NULL);
Expand All @@ -733,7 +741,7 @@
conn->next_is_frag = false;
}

LOG_DBG("send frag: buf %p len %d", buf, frag_len);
LOG_DBG("send frag: buf %p len %d", frag, frag_len);

/* At this point, the buffer is either a fragment or a full HCI packet.
* The flags are also valid.
Expand Down Expand Up @@ -776,15 +784,26 @@
*/
net_buf_unref(frag);

/* `buf` might not get destroyed right away, and its `tx`
* pointer will still be reachable. Make sure that we don't try
* to use the destroyed context later.
/* `buf` might not get destroyed right away because it may
* still be on a conn tx_queue, and its `tx` pointer will still
* be reachable. Make sure that we don't try to use the
* destroyed context later.
*/
conn_tx_destroy(conn, tx);
k_sem_give(bt_conn_get_pkts(conn));

/* Merge HCI driver errors */
return -EIO;

error_return:
/* Runtime handling of fatal errors when ASSERTS are disabled.
* Unref the buf and invoke callback with the error.
*/
net_buf_unref(buf);
if (cb) {
cb(conn, ud, err);
}
return err;
}

static struct k_poll_signal conn_change =
Expand Down Expand Up @@ -966,8 +985,8 @@
sys_slist_remove(&bt_dev.le.conn_ready, prev, &conn->_conn_ready);
(void)atomic_set(&conn->_conn_ready_lock, 0);

/* Append connection to list if it still has data */
if (conn->has_data(conn)) {
/* Append connection to list if it is connected and still has data */
if (conn->has_data(conn) && (conn->state == BT_CONN_CONNECTED)) {
LOG_DBG("appending %p to back of TX queue", conn);
bt_conn_data_ready(conn);
}
Expand Down Expand Up @@ -995,30 +1014,6 @@
}
#endif /* defined(CONFIG_BT_CONN) */

/* Stand-in for bt_send() on paths where `buf` is discarded instead of being
 * handed to the controller: one reference on `buf` is released and the user
 * callback, if any, is invoked with -ESHUTDOWN.
 *
 * conn: connection the buffer belongs to
 * buf:  buffer to release (one reference is dropped)
 * cb:   user callback; when NULL, the connection's `get_and_clear_cb` hook is
 *       called with `&cb` / `&ud` so that it can supply them
 * ud:   user data passed through to the callback
 */
static void destroy_and_callback(struct bt_conn *conn, struct net_buf *buf, bt_conn_tx_cb_t cb,
				 void *ud)
{
	if (cb == NULL) {
		/* No callback was handed in: let the connection's hook
		 * fill in `cb` and `ud` for this buffer.
		 */
		conn->get_and_clear_cb(conn, buf, &cb, &ud);
	}

	LOG_DBG("pop: cb %p userdata %p", cb, ud);

	/* Mirror the unref that bt_send() would have performed. Doing it
	 * before the callback runs means the buffer is hopefully already
	 * destroyed, so the user callback can allocate a new one.
	 */
	net_buf_unref(buf);

	if (cb == NULL) {
		/* Still nothing to notify. */
		return;
	}

	cb(conn, ud, -ESHUTDOWN);
}

static volatile bool _suspend_tx;

#if defined(CONFIG_BT_TESTING)
Expand Down Expand Up @@ -1061,17 +1056,7 @@

if (conn->state != BT_CONN_CONNECTED) {
LOG_WRN("conn %p: not connected", conn);

/* Call the user callbacks & destroy (final-unref) the buffers
* we were supposed to send.
*/
buf = conn->tx_data_pull(conn, SIZE_MAX, &buf_len);
while (buf) {
destroy_and_callback(conn, buf, cb, ud);
buf = conn->tx_data_pull(conn, SIZE_MAX, &buf_len);
}

goto exit;
goto raise_and_exit;
}

/* now that we are guaranteed resources, we can pull data from the upper
Expand Down Expand Up @@ -1105,25 +1090,12 @@
int err = send_buf(conn, buf, buf_len, cb, ud);

if (err) {
/* -EIO means `unrecoverable error`. It can be an assertion that
* failed or an error from the HCI driver.
*
* -ENOMEM means we thought we had all the resources to send the
* buf (ie. TX context + controller buffer) but one of them was
* not available. This is likely due to a failure of
* assumption, likely that we have been pre-empted somehow and
* that `tx_processor()` has been re-entered.
*
* In both cases, we destroy the buffer and mark the connection
* as dead.
*/
LOG_ERR("Fatal error (%d). Disconnecting %p", err, conn);
destroy_and_callback(conn, buf, cb, ud);
bt_conn_disconnect(conn, BT_HCI_ERR_REMOTE_USER_TERM_CONN);

goto exit;
}

raise_and_exit:
/* Always kick the TX work. It will self-suspend if it doesn't get
* resources or there is nothing left to send.
*/
Expand Down
14 changes: 13 additions & 1 deletion subsys/bluetooth/host/conn_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -288,10 +288,22 @@ struct bt_conn {
#endif

/* Callback into the higher-layers (L2CAP / ISO) to return a buffer for
* sending `amount` of bytes to HCI.
* sending `amount` of bytes to HCI. Will only be called when
* the state is connected. The higher-layer is responsible for purging
* the remaining buffers on disconnect.
*
* Scheduling from which channel to pull (e.g. for L2CAP) is done at the
* upper layer's discretion.
*
* Details about the returned net_buf when it is not NULL:
* - If the net_buf->len <= *length, then the net_buf has been removed
* from the tx_queue of the connection and the caller is now the
* owner of the only reference to the net_buf.
* - Otherwise, the net_buf is still on the tx_queue of the connection,
* and the callback has incremented the reference count so that the
* tx_queue and the caller each hold their own reference.
* - The caller must consume *length bytes from the net_buf before
* calling this function again.
*/
struct net_buf * (*tx_data_pull)(struct bt_conn *conn,
size_t amount,
Expand Down
31 changes: 20 additions & 11 deletions subsys/bluetooth/host/iso.c
Original file line number Diff line number Diff line change
Expand Up @@ -454,10 +454,18 @@ void bt_iso_connected(struct bt_conn *iso)
static void bt_iso_chan_disconnected(struct bt_iso_chan *chan, uint8_t reason)
{
const uint8_t conn_type = chan->iso->iso.info.type;
struct net_buf *buf;

LOG_DBG("%p, reason 0x%02x", chan, reason);

__ASSERT(chan->iso != NULL, "NULL conn for iso chan %p", chan);

/* release buffers from tx_queue */
while ((buf = k_fifo_get(&chan->iso->iso.txq, K_NO_WAIT))) {
__ASSERT_NO_MSG(!bt_buf_has_view(buf));
net_buf_unref(buf);
}

bt_iso_chan_set_state(chan, BT_ISO_STATE_DISCONNECTED);
bt_conn_set_state(chan->iso, BT_CONN_DISCONNECT_COMPLETE);

Expand Down Expand Up @@ -775,7 +783,8 @@ void bt_iso_recv(struct bt_conn *iso, struct net_buf *buf, uint8_t flags)
static bool iso_has_data(struct bt_conn *conn)
{
#if defined(CONFIG_BT_ISO_TX)
return !k_fifo_is_empty(&conn->iso.txq);
return ((conn->iso.chan->state == BT_ISO_STATE_CONNECTED) &&
!k_fifo_is_empty(&conn->iso.txq));
#else /* !CONFIG_BT_ISO_TX */
return false;
#endif /* CONFIG_BT_ISO_TX */
Expand All @@ -789,45 +798,45 @@ static struct net_buf *iso_data_pull(struct bt_conn *conn, size_t amount, size_t
/* Leave the PDU buffer in the queue until we have sent all its
* fragments.
*/
struct net_buf *frag = k_fifo_peek_head(&conn->iso.txq);
struct net_buf *q_frag = k_fifo_peek_head(&conn->iso.txq);

if (!frag) {
if (!q_frag) {
BT_ISO_DATA_DBG("signaled ready but no frag available");
/* Service other connections */
bt_tx_irq_raise();

return NULL;
}

if (conn->iso.chan->state != BT_ISO_STATE_CONNECTED) {
__maybe_unused struct net_buf *b = k_fifo_get(&conn->iso.txq, K_NO_WAIT);
__ASSERT_NO_MSG(conn->state == BT_CONN_CONNECTED);

if (conn->iso.chan->state != BT_ISO_STATE_CONNECTED) {
LOG_DBG("channel has been disconnected");
__ASSERT_NO_MSG(b == frag);

net_buf_unref(b);

/* Service other connections */
bt_tx_irq_raise();

return NULL;
}

if (bt_buf_has_view(frag)) {
if (bt_buf_has_view(q_frag)) {
/* This should not happen. conn.c should wait until the view is
* destroyed before requesting more data.
*/
LOG_DBG("already have view");
return NULL;
}

struct net_buf *frag = net_buf_ref(q_frag);
bool last_frag = amount >= frag->len;

if (last_frag) {
__maybe_unused struct net_buf *b = k_fifo_get(&conn->iso.txq, K_NO_WAIT);
q_frag = k_fifo_get(&conn->iso.txq, K_NO_WAIT);

BT_ISO_DATA_DBG("last frag, pop buf");
__ASSERT_NO_MSG(b == frag);
__ASSERT_NO_MSG(q_frag == frag);

net_buf_unref(q_frag);
}

*length = frag->len;
Expand Down
Loading
Loading