Skip to content

Commit

Permalink
tcp: switch rtt estimations to usec resolution
Browse files Browse the repository at this point in the history
Upcoming congestion controls for TCP require usec resolution for RTT
estimations. Millisecond resolution is simply not enough these days.

FQ/pacing in DC environments also require this change for finer control
and removal of bimodal behavior due to the current hack in
tcp_update_pacing_rate() for 'small rtt'

TCP_CONG_RTT_STAMP is no longer needed.

As Julian Anastasov pointed out, we need to keep user compatibility :
tcp_metrics used to export RTT and RTTVAR in msec resolution,
so we added RTT_US and RTTVAR_US. An iproute2 patch is needed
to use the new attributes if provided by the kernel.

In this example ss command displays a srtt of 32 usecs (10Gbit link)

lpk51:~# ./ss -i dst lpk52
Netid  State      Recv-Q Send-Q   Local Address:Port       Peer
Address:Port
tcp    ESTAB      0      1         10.246.11.51:42959
10.246.11.52:64614
         cubic wscale:6,6 rto:201 rtt:0.032/0.001 ato:40 mss:1448
cwnd:10 send
3620.0Mbps pacing_rate 7240.0Mbps unacked:1 rcv_rtt:993 rcv_space:29559

Updated iproute2 ip command displays :

lpk51:~# ./ip tcp_metrics | grep 10.246.11.52
10.246.11.52 age 561.914sec cwnd 10 rtt 274us rttvar 213us source
10.246.11.51

Old binary displays :

lpk51:~# ip tcp_metrics | grep 10.246.11.52
10.246.11.52 age 561.914sec cwnd 10 rtt 250us rttvar 125us source
10.246.11.51

With help from Julian Anastasov, Stephen Hemminger and Yuchung Cheng

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Cc: Stephen Hemminger <stephen@networkplumber.org>
Cc: Yuchung Cheng <ycheng@google.com>
Cc: Larry Brakmo <brakmo@google.com>
Cc: Julian Anastasov <ja@ssi.bg>
Signed-off-by: David S. Miller <davem@davemloft.net>
  • Loading branch information
Eric Dumazet authored and davem330 committed Feb 26, 2014
1 parent 363ec39 commit 740b0f1
Show file tree
Hide file tree
Showing 17 changed files with 174 additions and 169 deletions.
8 changes: 4 additions & 4 deletions include/linux/tcp.h
Original file line number Diff line number Diff line change
Expand Up @@ -201,10 +201,10 @@ struct tcp_sock {
u32 tlp_high_seq; /* snd_nxt at the time of TLP retransmit. */

/* RTT measurement */
u32 srtt; /* smoothed round trip time << 3 */
u32 mdev; /* medium deviation */
u32 mdev_max; /* maximal mdev for the last rtt period */
u32 rttvar; /* smoothed mdev_max */
u32 srtt_us; /* smoothed round trip time << 3 in usecs */
u32 mdev_us; /* medium deviation */
u32 mdev_max_us; /* maximal mdev for the last rtt period */
u32 rttvar_us; /* smoothed mdev_max */
u32 rtt_seq; /* sequence number to update rttvar */

u32 packets_out; /* Packets which are "in flight" */
Expand Down
10 changes: 7 additions & 3 deletions include/net/tcp.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
#include <linux/crypto.h>
#include <linux/cryptohash.h>
#include <linux/kref.h>
#include <linux/ktime.h>

#include <net/inet_connection_sock.h>
#include <net/inet_timewait_sock.h>
Expand Down Expand Up @@ -478,7 +479,6 @@ int __cookie_v4_check(const struct iphdr *iph, const struct tcphdr *th,
struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
struct ip_options *opt);
#ifdef CONFIG_SYN_COOKIES
#include <linux/ktime.h>

/* Syncookies use a monotonic timer which increments every 64 seconds.
* This counter is used both as a hash input and partially encoded into
Expand Down Expand Up @@ -619,7 +619,7 @@ static inline void tcp_bound_rto(const struct sock *sk)

static inline u32 __tcp_set_rto(const struct tcp_sock *tp)
{
return (tp->srtt >> 3) + tp->rttvar;
return usecs_to_jiffies((tp->srtt_us >> 3) + tp->rttvar_us);
}

static inline void __tcp_fast_path_on(struct tcp_sock *tp, u32 snd_wnd)
Expand Down Expand Up @@ -656,6 +656,11 @@ static inline u32 tcp_rto_min(struct sock *sk)
return rto_min;
}

static inline u32 tcp_rto_min_us(struct sock *sk)
{
return jiffies_to_usecs(tcp_rto_min(sk));
}

/* Compute the actual receive window we are currently advertising.
* Rcv_nxt can be after the window if our peer push more data
* than the offered window.
Expand Down Expand Up @@ -778,7 +783,6 @@ enum tcp_ca_event {
#define TCP_CA_BUF_MAX (TCP_CA_NAME_MAX*TCP_CA_MAX)

#define TCP_CONG_NON_RESTRICTED 0x1
#define TCP_CONG_RTT_STAMP 0x2

struct tcp_congestion_ops {
struct list_head list;
Expand Down
7 changes: 5 additions & 2 deletions include/uapi/linux/tcp_metrics.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,15 @@
#define TCP_METRICS_GENL_VERSION 0x1

enum tcp_metric_index {
TCP_METRIC_RTT,
TCP_METRIC_RTTVAR,
TCP_METRIC_RTT, /* in ms units */
TCP_METRIC_RTTVAR, /* in ms units */
TCP_METRIC_SSTHRESH,
TCP_METRIC_CWND,
TCP_METRIC_REORDERING,

TCP_METRIC_RTT_US, /* in usec units */
TCP_METRIC_RTTVAR_US, /* in usec units */

/* Always last. */
__TCP_METRIC_MAX,
};
Expand Down
8 changes: 4 additions & 4 deletions net/ipv4/tcp.c
Original file line number Diff line number Diff line change
Expand Up @@ -387,7 +387,7 @@ void tcp_init_sock(struct sock *sk)
INIT_LIST_HEAD(&tp->tsq_node);

icsk->icsk_rto = TCP_TIMEOUT_INIT;
tp->mdev = TCP_TIMEOUT_INIT;
tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);

/* So many TCP implementations out there (incorrectly) count the
* initial SYN frame in their delayed-ACK and congestion control
Expand Down Expand Up @@ -2339,7 +2339,7 @@ int tcp_disconnect(struct sock *sk, int flags)

sk->sk_shutdown = 0;
sock_reset_flag(sk, SOCK_DONE);
tp->srtt = 0;
tp->srtt_us = 0;
if ((tp->write_seq += tp->max_window + 2) == 0)
tp->write_seq = 1;
icsk->icsk_backoff = 0;
Expand Down Expand Up @@ -2783,8 +2783,8 @@ void tcp_get_info(const struct sock *sk, struct tcp_info *info)

info->tcpi_pmtu = icsk->icsk_pmtu_cookie;
info->tcpi_rcv_ssthresh = tp->rcv_ssthresh;
info->tcpi_rtt = jiffies_to_usecs(tp->srtt)>>3;
info->tcpi_rttvar = jiffies_to_usecs(tp->mdev)>>2;
info->tcpi_rtt = tp->srtt_us >> 3;
info->tcpi_rttvar = tp->mdev_us >> 2;
info->tcpi_snd_ssthresh = tp->snd_ssthresh;
info->tcpi_snd_cwnd = tp->snd_cwnd;
info->tcpi_advmss = tp->advmss;
Expand Down
4 changes: 0 additions & 4 deletions net/ipv4/tcp_cubic.c
Original file line number Diff line number Diff line change
Expand Up @@ -476,10 +476,6 @@ static int __init cubictcp_register(void)
/* divide by bic_scale and by constant Srtt (100ms) */
do_div(cube_factor, bic_scale * 10);

/* hystart needs ms clock resolution */
if (hystart && HZ < 1000)
cubictcp.flags |= TCP_CONG_RTT_STAMP;

return tcp_register_congestion_control(&cubictcp);
}

Expand Down
12 changes: 7 additions & 5 deletions net/ipv4/tcp_hybla.c
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ struct hybla {
u32 rho2; /* Rho * Rho, integer part */
u32 rho_3ls; /* Rho parameter, <<3 */
u32 rho2_7ls; /* Rho^2, <<7 */
u32 minrtt; /* Minimum smoothed round trip time value seen */
u32 minrtt_us; /* Minimum smoothed round trip time value seen */
};

/* Hybla reference round trip time (default= 1/40 sec = 25 ms), in ms */
Expand All @@ -35,7 +35,9 @@ static inline void hybla_recalc_param (struct sock *sk)
{
struct hybla *ca = inet_csk_ca(sk);

ca->rho_3ls = max_t(u32, tcp_sk(sk)->srtt / msecs_to_jiffies(rtt0), 8);
ca->rho_3ls = max_t(u32,
tcp_sk(sk)->srtt_us / (rtt0 * USEC_PER_MSEC),
8U);
ca->rho = ca->rho_3ls >> 3;
ca->rho2_7ls = (ca->rho_3ls * ca->rho_3ls) << 1;
ca->rho2 = ca->rho2_7ls >> 7;
Expand All @@ -59,7 +61,7 @@ static void hybla_init(struct sock *sk)
hybla_recalc_param(sk);

/* set minimum rtt as this is the 1st ever seen */
ca->minrtt = tp->srtt;
ca->minrtt_us = tp->srtt_us;
tp->snd_cwnd = ca->rho;
}

Expand Down Expand Up @@ -94,9 +96,9 @@ static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 acked,
int is_slowstart = 0;

/* Recalculate rho only if this srtt is the lowest */
if (tp->srtt < ca->minrtt){
if (tp->srtt_us < ca->minrtt_us) {
hybla_recalc_param(sk);
ca->minrtt = tp->srtt;
ca->minrtt_us = tp->srtt_us;
}

if (!tcp_is_cwnd_limited(sk, in_flight))
Expand Down
1 change: 0 additions & 1 deletion net/ipv4/tcp_illinois.c
Original file line number Diff line number Diff line change
Expand Up @@ -325,7 +325,6 @@ static void tcp_illinois_info(struct sock *sk, u32 ext,
}

static struct tcp_congestion_ops tcp_illinois __read_mostly = {
.flags = TCP_CONG_RTT_STAMP,
.init = tcp_illinois_init,
.ssthresh = tcp_illinois_ssthresh,
.cong_avoid = tcp_illinois_cong_avoid,
Expand Down
Loading

0 comments on commit 740b0f1

Please sign in to comment.