From 55d8a135e42feec3aa24d2aac411fb658374532f Mon Sep 17 00:00:00 2001
From: David Morley <morleyd@google.com>
Date: Fri, 14 Jul 2023 11:07:56 -0400
Subject: [PATCH 15/19] tcp: introduce per-route feature RTAX_FEATURE_ECN_LOW

Define and implement a new per-route feature, RTAX_FEATURE_ECN_LOW.

This feature indicates that the given destination network is a
low-latency ECN environment, meaning both that ECN CE marks are
applied by the network using a low-latency marking threshold and also
that TCP endpoints provide precise per-data-segment ECN feedback in
ACKs (where the ACK ECE flag echoes the received CE status of all
newly-acknowledged data segments). This feature indication can be used
by congestion control algorithms to decide how to interpret ECN
signals over the given destination network.

This feature is appropriate for datacenter-style ECN marking, such as
the ECN marking approach expected by DCTCP or BBR congestion control
modules.

Signed-off-by: David Morley <morleyd@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Tested-by: David Morley <morleyd@google.com>
Change-Id: I6bc06e9c6cb426fbae7243fc71c9a8c18175f5d3
Signed-off-by: Alexandre Frade <kernel@xanmod.org>
---
 include/net/tcp.h              | 10 ++++++++++
 include/uapi/linux/rtnetlink.h |  4 +++-
 net/ipv4/tcp_minisocks.c       |  2 ++
 net/ipv4/tcp_output.c          |  6 ++++--
 4 files changed, 19 insertions(+), 3 deletions(-)

--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -376,6 +376,7 @@ static inline void tcp_dec_quickack_mode
 #define	TCP_ECN_QUEUE_CWR	2
 #define	TCP_ECN_DEMAND_CWR	4
 #define	TCP_ECN_SEEN		8
+#define	TCP_ECN_LOW		16
 
 enum tcp_tw_status {
 	TCP_TW_SUCCESS = 0,
@@ -794,6 +795,15 @@ static inline void tcp_fast_path_check(s
 		tcp_fast_path_on(tp);
 }
 
+static inline void tcp_set_ecn_low_from_dst(struct sock *sk,
+					    const struct dst_entry *dst)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (dst_feature(dst, RTAX_FEATURE_ECN_LOW))
+		tp->ecn_flags |= TCP_ECN_LOW;
+}
+
 u32 tcp_delack_max(const struct sock *sk);
 
 /* Compute the actual rto_min value */
--- a/include/uapi/linux/rtnetlink.h
+++ b/include/uapi/linux/rtnetlink.h
@@ -507,12 +507,14 @@ enum {
 #define RTAX_FEATURE_TIMESTAMP		(1 << 2) /* unused */
 #define RTAX_FEATURE_ALLFRAG		(1 << 3) /* unused */
 #define RTAX_FEATURE_TCP_USEC_TS	(1 << 4)
+#define RTAX_FEATURE_ECN_LOW		(1 << 5)
 
 #define RTAX_FEATURE_MASK	(RTAX_FEATURE_ECN |		\
 				 RTAX_FEATURE_SACK |		\
 				 RTAX_FEATURE_TIMESTAMP |	\
 				 RTAX_FEATURE_ALLFRAG |		\
-				 RTAX_FEATURE_TCP_USEC_TS)
+				 RTAX_FEATURE_TCP_USEC_TS |	\
+				 RTAX_FEATURE_ECN_LOW)
 
 struct rta_session {
 	__u8	proto;
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -462,6 +462,8 @@ void tcp_ca_openreq_child(struct sock *s
 	u32 ca_key = dst_metric(dst, RTAX_CC_ALGO);
 	bool ca_got_dst = false;
 
+	tcp_set_ecn_low_from_dst(sk, dst);
+
 	if (ca_key != TCP_CA_UNSPEC) {
 		const struct tcp_congestion_ops *ca;
 
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -339,10 +339,9 @@ static void tcp_ecn_send_syn(struct sock
 	bool bpf_needs_ecn = tcp_bpf_ca_needs_ecn(sk);
 	bool use_ecn = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn) == 1 ||
 		tcp_ca_needs_ecn(sk) || bpf_needs_ecn;
+	const struct dst_entry *dst = __sk_dst_get(sk);
 
 	if (!use_ecn) {
-		const struct dst_entry *dst = __sk_dst_get(sk);
-
 		if (dst && dst_feature(dst, RTAX_FEATURE_ECN))
 			use_ecn = true;
 	}
@@ -354,6 +353,9 @@ static void tcp_ecn_send_syn(struct sock
 		tp->ecn_flags = TCP_ECN_OK;
 		if (tcp_ca_needs_ecn(sk) || bpf_needs_ecn)
 			INET_ECN_xmit(sk);
+
+		if (dst)
+			tcp_set_ecn_low_from_dst(sk, dst);
 	}
 }
 
