LCOV - code coverage report
Current view: top level - net/ipv4 - tcp_cong.c (source / functions) Hit Total Coverage
Test: landlock.info Lines: 65 214 30.4 %
Date: 2021-04-22 12:43:58 Functions: 7 22 31.8 %

          Line data    Source code
       1             : // SPDX-License-Identifier: GPL-2.0-only
       2             : /*
       3             :  * Pluggable TCP congestion control support and newReno
       4             :  * congestion control.
       5             :  * Based on ideas from I/O scheduler support and Web100.
       6             :  *
       7             :  * Copyright (C) 2005 Stephen Hemminger <shemminger@osdl.org>
       8             :  */
       9             : 
      10             : #define pr_fmt(fmt) "TCP: " fmt
      11             : 
      12             : #include <linux/module.h>
      13             : #include <linux/mm.h>
      14             : #include <linux/types.h>
      15             : #include <linux/list.h>
      16             : #include <linux/gfp.h>
      17             : #include <linux/jhash.h>
      18             : #include <net/tcp.h>
      19             : 
      20             : static DEFINE_SPINLOCK(tcp_cong_list_lock);     /* held while adding/removing entries and rewriting the allowed flags */
      21             : static LIST_HEAD(tcp_cong_list);                /* all registered tcp_congestion_ops, linked through ->list */
      22             : 
      23             : /* Find a registered congestion control algorithm by name.
                     :  *
                     :  * Simple linear search over tcp_cong_list, don't expect many entries!
                     :  * Returns the first entry whose ->name compares equal to @name, or NULL
                     :  * if none does. Callers in this file hold either rcu_read_lock() or
                     :  * tcp_cong_list_lock around the call.
                     :  */
      24           1 : struct tcp_congestion_ops *tcp_ca_find(const char *name)
      25             : {
      26           1 :         struct tcp_congestion_ops *e;
      27             : 
      28           2 :         list_for_each_entry_rcu(e, &tcp_cong_list, list) {
      29           2 :                 if (strcmp(e->name, name) == 0)
      30           1 :                         return e;
      31             :         }
      32             : 
      33             :         return NULL;
      34             : }
      35             : 
      36             : /* Find a congestion control algorithm by name, trying to load its module
                     :  * if it is not registered yet.
                     :  *
                     :  * Must be called with rcu lock held. When CONFIG_MODULES is set, the
                     :  * lookup misses and the caller has CAP_NET_ADMIN, the RCU read lock is
                     :  * released, request_module("tcp_<name>") is issued, the lock is re-taken
                     :  * and the lookup is repeated once. Returns the entry found, or NULL.
                     :  *
                     :  * @net is not referenced in this function's body.
                     :  */
      37           1 : static struct tcp_congestion_ops *tcp_ca_find_autoload(struct net *net,
      38             :                                                        const char *name)
      39             : {
      40           2 :         struct tcp_congestion_ops *ca = tcp_ca_find(name);
      41             : 
      42             : #ifdef CONFIG_MODULES
      43             :         if (!ca && capable(CAP_NET_ADMIN)) {
      44             :                 rcu_read_unlock();      /* leave the read-side section around the module request */
      45             :                 request_module("tcp_%s", name);
      46             :                 rcu_read_lock();
      47             :                 ca = tcp_ca_find(name); /* retry once after the load attempt */
      48             :         }
      49             : #endif
      50           0 :         return ca;
      51             : }
      52             : 
      53             : /* Find a registered congestion control algorithm by its ->key.
                     :  *
                     :  * Simple linear search over tcp_cong_list, not much in here. Returns the
                     :  * first entry whose key equals @key, or NULL. Callers in this file hold
                     :  * rcu_read_lock() or tcp_cong_list_lock around the call.
                     :  */
      54           2 : struct tcp_congestion_ops *tcp_ca_find_key(u32 key)
      55             : {
      56           2 :         struct tcp_congestion_ops *e;
      57             : 
      58           3 :         list_for_each_entry_rcu(e, &tcp_cong_list, list) {
      59           1 :                 if (e->key == key)
      60           0 :                         return e;
      61             :         }
      62             : 
      63             :         return NULL;
      64             : }
      65             : 
      66             : /*
      67             :  * Attach new congestion control algorithm to the list
      68             :  * of available options.
                     :  *
                     :  * @ca must provide ->ssthresh, ->undo_cwnd and at least one of
                     :  * ->cong_avoid / ->cong_control. ca->key is (re)computed from ca->name
                     :  * before the list is consulted.
                     :  *
                     :  * Returns 0 on success, -EINVAL if a required op is missing, or -EEXIST
                     :  * if the computed key equals TCP_CA_UNSPEC or is already on the list.
      69             :  */
      70           2 : int tcp_register_congestion_control(struct tcp_congestion_ops *ca)
      71             : {
      72           2 :         int ret = 0;
      73             : 
      74             :         /* all algorithms must implement these */
      75           2 :         if (!ca->ssthresh || !ca->undo_cwnd ||
      76           2 :             !(ca->cong_avoid || ca->cong_control)) {
      77           0 :                 pr_err("%s does not implement required ops\n", ca->name);
      78           0 :                 return -EINVAL;
      79             :         }
      80             : 
      81           2 :         ca->key = jhash(ca->name, sizeof(ca->name), strlen(ca->name)); /* hashes the whole name[] array; strlen() is jhash()'s third argument */
      82             : 
      83           2 :         spin_lock(&tcp_cong_list_lock);
      84           4 :         if (ca->key == TCP_CA_UNSPEC || tcp_ca_find_key(ca->key)) {
      85           0 :                 pr_notice("%s already registered or non-unique key\n",
      86             :                           ca->name);
      87           0 :                 ret = -EEXIST;
      88             :         } else {
      89           2 :                 list_add_tail_rcu(&ca->list, &tcp_cong_list);
      90           2 :                 pr_debug("%s registered\n", ca->name);
      91             :         }
      92           2 :         spin_unlock(&tcp_cong_list_lock);
      93             : 
      94           2 :         return ret;
      95             : }
      96             : EXPORT_SYMBOL_GPL(tcp_register_congestion_control);
      97             : 
      98             : /*
      99             :  * Remove congestion control algorithm, called from
     100             :  * the module's remove function.  Module ref counts are used
     101             :  * to ensure that this can't be done till all sockets using
     102             :  * that method are closed.
                     :  *
                     :  * The entry is unlinked from tcp_cong_list under tcp_cong_list_lock;
                     :  * the function then waits in synchronize_rcu() before returning.
     103             :  */
     104           0 : void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca)
     105             : {
     106           0 :         spin_lock(&tcp_cong_list_lock);
     107           0 :         list_del_rcu(&ca->list);
     108           0 :         spin_unlock(&tcp_cong_list_lock);
     109             : 
     110             :         /* Wait for outstanding readers to complete before the
     111             :          * module gets removed entirely.
     112             :          *
     113             :          * A try_module_get() should fail by now as our module is
     114             :          * in "going" state since no refs are held anymore and
     115             :          * module_exit() handler being called.
     116             :          */
     117           0 :         synchronize_rcu();
     118           0 : }
     119             : EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control);
     120             : /* Translate a congestion control name into its key.
                     :  *
                     :  * Looks @name up via tcp_ca_find_autoload() under rcu_read_lock(). On a
                     :  * hit, returns the entry's key and stores in *@ecn_ca whether the entry
                     :  * has TCP_CONG_NEEDS_ECN set. On a miss, returns TCP_CA_UNSPEC and leaves
                     :  * *@ecn_ca untouched. Annotated with might_sleep().
                     :  */
     121           0 : u32 tcp_ca_get_key_by_name(struct net *net, const char *name, bool *ecn_ca)
     122             : {
     123           0 :         const struct tcp_congestion_ops *ca;
     124           0 :         u32 key = TCP_CA_UNSPEC;
     125             : 
     126           0 :         might_sleep();
     127             : 
     128           0 :         rcu_read_lock();
     129           0 :         ca = tcp_ca_find_autoload(net, name);
     130           0 :         if (ca) {
     131           0 :                 key = ca->key;
     132           0 :                 *ecn_ca = ca->flags & TCP_CONG_NEEDS_ECN;
     133             :         }
     134           0 :         rcu_read_unlock();
     135             : 
     136           0 :         return key;
     137             : }
     138             : EXPORT_SYMBOL_GPL(tcp_ca_get_key_by_name);
     139             : /* Translate a congestion control key back into a name.
                     :  *
                     :  * On a hit, strncpy()s TCP_CA_NAME_MAX bytes of the entry's name into
                     :  * @buffer (which therefore needs at least that much room) and returns
                     :  * strncpy()'s result. On a miss, returns NULL without touching @buffer.
                     :  */
     140           0 : char *tcp_ca_get_name_by_key(u32 key, char *buffer)
     141             : {
     142           0 :         const struct tcp_congestion_ops *ca;
     143           0 :         char *ret = NULL;
     144             : 
     145           0 :         rcu_read_lock();
     146           0 :         ca = tcp_ca_find_key(key);
     147           0 :         if (ca)
     148           0 :                 ret = strncpy(buffer, ca->name,
     149             :                               TCP_CA_NAME_MAX);
     150           0 :         rcu_read_unlock();
     151             : 
     152           0 :         return ret;
     153             : }
     154             : EXPORT_SYMBOL_GPL(tcp_ca_get_name_by_key);
     155             : 
     156             : /* Assign choice of congestion control.
                     :  *
                     :  * Gives @sk the default of its struct net
                     :  * (net->ipv4.tcp_congestion_control). If a reference on that default
                     :  * cannot be taken with bpf_try_module_get(), &tcp_reno is installed
                     :  * instead, with no bpf_try_module_get() call of its own. The private CA
                     :  * area is then zeroed and ECN transmit is switched on or off according
                     :  * to the chosen algorithm's TCP_CONG_NEEDS_ECN flag.
                     :  */
     157           7 : void tcp_assign_congestion_control(struct sock *sk)
     158             : {
     159           7 :         struct net *net = sock_net(sk);
     160           7 :         struct inet_connection_sock *icsk = inet_csk(sk);
     161           7 :         const struct tcp_congestion_ops *ca;
     162             : 
     163           7 :         rcu_read_lock();
     164           7 :         ca = rcu_dereference(net->ipv4.tcp_congestion_control);
     165           7 :         if (unlikely(!bpf_try_module_get(ca, ca->owner)))
     166             :                 ca = &tcp_reno;
     167           7 :         icsk->icsk_ca_ops = ca;
     168           7 :         rcu_read_unlock();
     169             : 
     170           7 :         memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv));
     171           7 :         if (ca->flags & TCP_CONG_NEEDS_ECN)
     172           0 :                 INET_ECN_xmit(sk);
     173             :         else
     174           7 :                 INET_ECN_dontxmit(sk);
     175           7 : }
     176             : /* Run the socket's congestion control ->init() hook, if it has one.
                     :  *
                     :  * Also resets tcp_sk(sk)->prior_ssthresh to 0, re-evaluates the ECN
                     :  * transmit setting via tcp_ca_needs_ecn() and sets icsk_ca_initialized.
                     :  */
     177           4 : void tcp_init_congestion_control(struct sock *sk)
     178             : {
     179           4 :         struct inet_connection_sock *icsk = inet_csk(sk);
     180             : 
     181           4 :         tcp_sk(sk)->prior_ssthresh = 0;
     182           4 :         if (icsk->icsk_ca_ops->init)
     183           4 :                 icsk->icsk_ca_ops->init(sk);
     184           4 :         if (tcp_ca_needs_ecn(sk))
     185           0 :                 INET_ECN_xmit(sk);
     186             :         else
     187           4 :                 INET_ECN_dontxmit(sk);
     188           4 :         icsk->icsk_ca_initialized = 1;
     189           4 : }
     190             : /* Switch @sk over to @ca.
                     :  *
                     :  * Cleans up the current algorithm (tcp_cleanup_congestion_control()),
                     :  * installs @ca, sets icsk_ca_setsockopt, zeroes the private CA area and
                     :  * updates the ECN transmit setting. tcp_init_congestion_control() is
                     :  * then called unless sk_state is covered by TCPF_CLOSE | TCPF_LISTEN.
                     :  * The caller in this file takes a bpf_try_module_get() reference on @ca
                     :  * before calling.
                     :  */
     191           0 : static void tcp_reinit_congestion_control(struct sock *sk,
     192             :                                           const struct tcp_congestion_ops *ca)
     193             : {
     194           0 :         struct inet_connection_sock *icsk = inet_csk(sk);
     195             : 
     196           0 :         tcp_cleanup_congestion_control(sk);
     197           0 :         icsk->icsk_ca_ops = ca;
     198           0 :         icsk->icsk_ca_setsockopt = 1;
     199           0 :         memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv));
     200             : 
     201           0 :         if (ca->flags & TCP_CONG_NEEDS_ECN)
     202           0 :                 INET_ECN_xmit(sk);
     203             :         else
     204           0 :                 INET_ECN_dontxmit(sk);
     205             : 
     206           0 :         if (!((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
     207           0 :                 tcp_init_congestion_control(sk);
     208           0 : }
     209             : 
     210             : /* Manage refcounts on socket close.
                     :  *
                     :  * Calls the current algorithm's ->release() hook when it is set, then
                     :  * drops the reference on the algorithm with bpf_module_put(). Also used
                     :  * by tcp_reinit_congestion_control() before a new algorithm is installed.
                     :  */
     211           4 : void tcp_cleanup_congestion_control(struct sock *sk)
     212             : {
     213           4 :         struct inet_connection_sock *icsk = inet_csk(sk);
     214             : 
     215           4 :         if (icsk->icsk_ca_ops->release)
     216           0 :                 icsk->icsk_ca_ops->release(sk);
     217           0 :         bpf_module_put(icsk->icsk_ca_ops, icsk->icsk_ca_ops->owner);
     218           4 : }
     219             : 
     220             : /* Used by sysctl to change default congestion control.
                     :  *
                     :  * Looks @name up (autoloading if possible), takes a reference on it and
                     :  * swaps it into net->ipv4.tcp_congestion_control with xchg(); the
                     :  * reference on the previous default, if any, is dropped. The new default
                     :  * is also flagged TCP_CONG_NON_RESTRICTED.
                     :  *
                     :  * Returns 0 on success, -ENOENT if @name is unknown, or -EBUSY if
                     :  * bpf_try_module_get() fails.
                     :  */
     221           1 : int tcp_set_default_congestion_control(struct net *net, const char *name)
     222             : {
     223           1 :         struct tcp_congestion_ops *ca;
     224           1 :         const struct tcp_congestion_ops *prev;
     225           1 :         int ret;
     226             : 
     227           1 :         rcu_read_lock();
     228           1 :         ca = tcp_ca_find_autoload(net, name);
     229           1 :         if (!ca) {
     230             :                 ret = -ENOENT;
     231           1 :         } else if (!bpf_try_module_get(ca, ca->owner)) {
     232             :                 ret = -EBUSY;
     233             :         } else {
     234           1 :                 prev = xchg(&net->ipv4.tcp_congestion_control, ca);
     235           1 :                 if (prev)
     236           1 :                         bpf_module_put(prev, prev->owner);
     237             : 
     238           1 :                 ca->flags |= TCP_CONG_NON_RESTRICTED;
     239           1 :                 ret = 0;
     240             :         }
     241           1 :         rcu_read_unlock();
     242             : 
     243           1 :         return ret;
     244             : }
     245             : 
     246             : /* Set default value from kernel configuration at bootup.
                     :  *
                     :  * Registered with late_initcall(); applies CONFIG_DEFAULT_TCP_CONG to
                     :  * init_net and returns tcp_set_default_congestion_control()'s result.
                     :  */
     247           1 : static int __init tcp_congestion_default(void)
     248             : {
     249           1 :         return tcp_set_default_congestion_control(&init_net,
     250             :                                                   CONFIG_DEFAULT_TCP_CONG);
     251             : }
     252             : late_initcall(tcp_congestion_default);
     253             : 
     254             : /* Build string with list of available congestion control values.
                     :  *
                     :  * Writes the names of all registered algorithms into @buf, separated by
                     :  * single spaces, passing the remaining room (@maxlen - offset) to each
                     :  * snprintf(). Stops with a one-time warning once the accumulated length
                     :  * reaches @maxlen. Unlike tcp_get_allowed_congestion_control(), @buf is
                     :  * not pre-terminated here, so it is left untouched when the list is
                     :  * empty.
                     :  */
     255           0 : void tcp_get_available_congestion_control(char *buf, size_t maxlen)
     256             : {
     257           0 :         struct tcp_congestion_ops *ca;
     258           0 :         size_t offs = 0;
     259             : 
     260           0 :         rcu_read_lock();
     261           0 :         list_for_each_entry_rcu(ca, &tcp_cong_list, list) {
     262           0 :                 offs += snprintf(buf + offs, maxlen - offs,
     263             :                                  "%s%s",
     264           0 :                                  offs == 0 ? "" : " ", ca->name);
     265             : 
     266           0 :                 if (WARN_ON_ONCE(offs >= maxlen))
     267             :                         break;
     268             :         }
     269           0 :         rcu_read_unlock();
     270           0 : }
     271             : 
     272             : /* Get current default congestion control.
                     :  *
                     :  * Copies the name of net->ipv4.tcp_congestion_control into @name with
                     :  * strncpy(..., TCP_CA_NAME_MAX), so @name needs room for that many bytes.
                     :  * The default pointer is dereferenced without a NULL check.
                     :  */
     273           0 : void tcp_get_default_congestion_control(struct net *net, char *name)
     274             : {
     275           0 :         const struct tcp_congestion_ops *ca;
     276             : 
     277           0 :         rcu_read_lock();
     278           0 :         ca = rcu_dereference(net->ipv4.tcp_congestion_control);
     279           0 :         strncpy(name, ca->name, TCP_CA_NAME_MAX);
     280           0 :         rcu_read_unlock();
     281           0 : }
     282             : 
     283             : /* Build list of non-restricted congestion control values.
                     :  *
                     :  * Like tcp_get_available_congestion_control(), but only algorithms with
                     :  * TCP_CONG_NON_RESTRICTED set are listed, and @buf is made an empty
                     :  * string first so the result is valid even when nothing qualifies.
                     :  */
     284           0 : void tcp_get_allowed_congestion_control(char *buf, size_t maxlen)
     285             : {
     286           0 :         struct tcp_congestion_ops *ca;
     287           0 :         size_t offs = 0;
     288             : 
     289           0 :         *buf = '\0';
     290           0 :         rcu_read_lock();
     291           0 :         list_for_each_entry_rcu(ca, &tcp_cong_list, list) {
     292           0 :                 if (!(ca->flags & TCP_CONG_NON_RESTRICTED))
     293           0 :                         continue;
     294           0 :                 offs += snprintf(buf + offs, maxlen - offs,
     295             :                                  "%s%s",
     296           0 :                                  offs == 0 ? "" : " ", ca->name);
     297             : 
     298           0 :                 if (WARN_ON_ONCE(offs >= maxlen))
     299             :                         break;
     300             :         }
     301           0 :         rcu_read_unlock();
     302           0 : }
     303             : 
     304             : /* Change list of non-restricted congestion control.
                     :  *
                     :  * @val is a space-separated list of algorithm names. It is tokenised in
                     :  * place by strsep() in pass 3, so the caller's buffer is modified; pass 1
                     :  * works on a kstrdup() copy so that @val is still intact if validation
                     :  * fails. Both parsing loops stop at the first empty token.
                     :  *
                     :  * Returns 0 on success, -ENOMEM if the copy cannot be allocated, or
                     :  * -ENOENT if any name is not registered (in which case no flag is
                     :  * changed).
                     :  */
     305           0 : int tcp_set_allowed_congestion_control(char *val)
     306             : {
     307           0 :         struct tcp_congestion_ops *ca;
     308           0 :         char *saved_clone, *clone, *name;
     309           0 :         int ret = 0;
     310             : 
     311           0 :         saved_clone = clone = kstrdup(val, GFP_USER); /* strsep() advances clone; saved_clone keeps the pointer for kfree() */
     312           0 :         if (!clone)
     313             :                 return -ENOMEM;
     314             : 
     315           0 :         spin_lock(&tcp_cong_list_lock);
     316             :         /* pass 1 check for bad entries */
     317           0 :         while ((name = strsep(&clone, " ")) && *name) {
     318           0 :                 ca = tcp_ca_find(name);
     319           0 :                 if (!ca) {
     320           0 :                         ret = -ENOENT;
     321           0 :                         goto out;
     322             :                 }
     323             :         }
     324             : 
     325             :         /* pass 2 clear old values */
     326           0 :         list_for_each_entry_rcu(ca, &tcp_cong_list, list)
     327           0 :                 ca->flags &= ~TCP_CONG_NON_RESTRICTED;
     328             : 
     329             :         /* pass 3 mark as allowed */
     330           0 :         while ((name = strsep(&val, " ")) && *name) {
     331           0 :                 ca = tcp_ca_find(name);
     332           0 :                 WARN_ON(!ca);
     333           0 :                 if (ca)
     334           0 :                         ca->flags |= TCP_CONG_NON_RESTRICTED;
     335             :         }
     336           0 : out:
     337           0 :         spin_unlock(&tcp_cong_list_lock);
     338           0 :         kfree(saved_clone);
     339             : 
     340           0 :         return ret;
     341             : }
     342             : 
     343             : /* Change congestion control for socket.
     344             :  *
     345             :  * @load selects the lookup: tcp_ca_find() when false,
     346             :  * tcp_ca_find_autoload() when true. In both cases a successful switch
                     :  * goes through tcp_reinit_congestion_control(). Asking for the algorithm
                     :  * that is already in use only sets icsk_ca_setsockopt and returns 0.
                     :  *
                     :  * Returns -EPERM if icsk_ca_dst_locked is set, or if the algorithm is
                     :  * not TCP_CONG_NON_RESTRICTED and @cap_net_admin is false; -ENOENT if
                     :  * @name is unknown; -EBUSY if bpf_try_module_get() fails.
     347             :  */
     348           0 : int tcp_set_congestion_control(struct sock *sk, const char *name, bool load,
     349             :                                bool cap_net_admin)
     350             : {
     351           0 :         struct inet_connection_sock *icsk = inet_csk(sk);
     352           0 :         const struct tcp_congestion_ops *ca;
     353           0 :         int err = 0;
     354             : 
     355           0 :         if (icsk->icsk_ca_dst_locked)
     356             :                 return -EPERM;
     357             : 
     358           0 :         rcu_read_lock();
     359           0 :         if (!load)
     360           0 :                 ca = tcp_ca_find(name);
     361             :         else
     362           0 :                 ca = tcp_ca_find_autoload(sock_net(sk), name);
     363             : 
     364             :         /* No change asking for existing value */
     365           0 :         if (ca == icsk->icsk_ca_ops) {
     366           0 :                 icsk->icsk_ca_setsockopt = 1;
     367           0 :                 goto out;
     368             :         }
     369             : 
     370           0 :         if (!ca)
     371             :                 err = -ENOENT;
     372           0 :         else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) || cap_net_admin))
     373             :                 err = -EPERM;
     374           0 :         else if (!bpf_try_module_get(ca, ca->owner))
     375             :                 err = -EBUSY;
     376             :         else
     377           0 :                 tcp_reinit_congestion_control(sk, ca);
     378           0 :  out:
     379           0 :         rcu_read_unlock();
     380           0 :         return err;
     381             : }
     382             : 
     383             : /* Slow start is used when congestion window is no greater than the slow start
     384             :  * threshold. We base on RFC2581 and also handle stretch ACKs properly.
     385             :  * We do not implement RFC3465 Appropriate Byte Counting (ABC) per se but
     386             :  * something better;) a packet is only considered (s)acked in its entirety to
     387             :  * defend the ACK attacks described in the RFC. Slow start processes a stretch
     388             :  * ACK of degree N as if N acks of degree 1 are received back to back except
     389             :  * ABC caps N to 2. Slow start exits when cwnd grows over ssthresh and
     390             :  * returns the leftover acks to adjust cwnd in congestion avoidance mode.
                     :  *
                     :  * Returns the part of @acked that was not used to grow snd_cwnd towards
                     :  * snd_ssthresh. The stored snd_cwnd is additionally capped at
                     :  * snd_cwnd_clamp.
     391             :  */
     392           0 : u32 tcp_slow_start(struct tcp_sock *tp, u32 acked)
     393             : {
     394           0 :         u32 cwnd = min(tp->snd_cwnd + acked, tp->snd_ssthresh);
     395             : 
     396           0 :         acked -= cwnd - tp->snd_cwnd;
     397           0 :         tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp);
     398             : 
     399           0 :         return acked;
     400             : }
     401             : EXPORT_SYMBOL_GPL(tcp_slow_start);
     402             : 
     403             : /* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd (or alternative w),
     404             :  * for every packet that was ACKed.
                     :  *
                     :  * In practice @acked is accumulated in snd_cwnd_cnt and snd_cwnd grows by
                     :  * one for every @w packets counted there; the remainder stays in
                     :  * snd_cwnd_cnt. The result is capped at snd_cwnd_clamp.
     405             :  */
     406           0 : void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked)
     407             : {
     408             :         /* If credits accumulated at a higher w, apply them gently now. */
     409           0 :         if (tp->snd_cwnd_cnt >= w) {
     410           0 :                 tp->snd_cwnd_cnt = 0;
     411           0 :                 tp->snd_cwnd++;
     412             :         }
     413             : 
     414           0 :         tp->snd_cwnd_cnt += acked;
     415           0 :         if (tp->snd_cwnd_cnt >= w) {
     416           0 :                 u32 delta = tp->snd_cwnd_cnt / w;
     417             : 
     418           0 :                 tp->snd_cwnd_cnt -= delta * w;
     419           0 :                 tp->snd_cwnd += delta;
     420             :         }
     421           0 :         tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_cwnd_clamp);
     422           0 : }
     423             : EXPORT_SYMBOL_GPL(tcp_cong_avoid_ai);
     424             : 
     425             : /*
     426             :  * TCP Reno congestion control
     427             :  * This is special case used for fallback as well.
     428             :  */
     429             : /* This is Jacobson's slow start and congestion avoidance.
     430             :  * SIGCOMM '88, p. 328.
                     :  *
                     :  * Does nothing unless tcp_is_cwnd_limited(sk). While in slow start,
                     :  * @acked is first fed to tcp_slow_start(); whatever it returns is then
                     :  * applied by tcp_cong_avoid_ai() with w = snd_cwnd. @ack is not used.
     431             :  */
     432           0 : void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked)
     433             : {
     434           0 :         struct tcp_sock *tp = tcp_sk(sk);
     435             : 
     436           0 :         if (!tcp_is_cwnd_limited(sk))
     437             :                 return;
     438             : 
     439             :         /* In "safe" area, increase. */
     440           0 :         if (tcp_in_slow_start(tp)) {
     441           0 :                 acked = tcp_slow_start(tp, acked);
     442           0 :                 if (!acked)
     443             :                         return;
     444             :         }
     445             :         /* In dangerous area, increase slowly. */
     446           0 :         tcp_cong_avoid_ai(tp, tp->snd_cwnd, acked);
     447             : }
     448             : EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid);
     449             : 
     450             : /* Slow start threshold is half the congestion window (min 2).
                     :  *
                     :  * Installed as tcp_reno's ->ssthresh op; only reads snd_cwnd.
                     :  */
     451           0 : u32 tcp_reno_ssthresh(struct sock *sk)
     452             : {
     453           0 :         const struct tcp_sock *tp = tcp_sk(sk);
     454             : 
     455           0 :         return max(tp->snd_cwnd >> 1U, 2U);
     456             : }
     457             : EXPORT_SYMBOL_GPL(tcp_reno_ssthresh);
     458             : /* Returns the larger of the current snd_cwnd and the saved prior_cwnd.
                     :  *
                     :  * Installed as tcp_reno's ->undo_cwnd op.
                     :  */
     459           0 : u32 tcp_reno_undo_cwnd(struct sock *sk)
     460             : {
     461           0 :         const struct tcp_sock *tp = tcp_sk(sk);
     462             : 
     463           0 :         return max(tp->snd_cwnd, tp->prior_cwnd);
     464             : }
     465             : EXPORT_SYMBOL_GPL(tcp_reno_undo_cwnd);
     466             : /* Built-in Reno ops. tcp_assign_congestion_control() falls back to this
                     :  * when a reference on the configured default cannot be taken.
                     :  */
     467             : struct tcp_congestion_ops tcp_reno = {
     468             :         .flags          = TCP_CONG_NON_RESTRICTED,
     469             :         .name           = "reno",
     470             :         .owner          = THIS_MODULE,
     471             :         .ssthresh       = tcp_reno_ssthresh,
     472             :         .cong_avoid     = tcp_reno_cong_avoid,
     473             :         .undo_cwnd      = tcp_reno_undo_cwnd,
     474             : };

Generated by: LCOV version 1.14