LCOV - code coverage report
Current view: top level - net/ipv4 - tcp_input.c (source / functions)
Test: landlock.info
Date: 2021-04-22 12:43:58

                 Hit     Total   Coverage
Lines:           903      3051     29.6 %
Functions:        56       145     38.6 %

          Line data    Source code
       1             : // SPDX-License-Identifier: GPL-2.0
       2             : /*
       3             :  * INET         An implementation of the TCP/IP protocol suite for the LINUX
       4             :  *              operating system.  INET is implemented using the  BSD Socket
       5             :  *              interface as the means of communication with the user level.
       6             :  *
       7             :  *              Implementation of the Transmission Control Protocol(TCP).
       8             :  *
       9             :  * Authors:     Ross Biro
      10             :  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
      11             :  *              Mark Evans, <evansmp@uhura.aston.ac.uk>
      12             :  *              Corey Minyard <wf-rch!minyard@relay.EU.net>
      13             :  *              Florian La Roche, <flla@stud.uni-sb.de>
      14             :  *              Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
      15             :  *              Linus Torvalds, <torvalds@cs.helsinki.fi>
      16             :  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
      17             :  *              Matthew Dillon, <dillon@apollo.west.oic.com>
      18             :  *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
      19             :  *              Jorge Cwik, <jorge@laser.satlink.net>
      20             :  */
      21             : 
      22             : /*
      23             :  * Changes:
      24             :  *              Pedro Roque     :       Fast Retransmit/Recovery.
      25             :  *                                      Two receive queues.
      26             :  *                                      Retransmit queue handled by TCP.
      27             :  *                                      Better retransmit timer handling.
      28             :  *                                      New congestion avoidance.
      29             :  *                                      Header prediction.
      30             :  *                                      Variable renaming.
      31             :  *
      32             :  *              Eric            :       Fast Retransmit.
      33             :  *              Randy Scott     :       MSS option defines.
      34             :  *              Eric Schenk     :       Fixes to slow start algorithm.
      35             :  *              Eric Schenk     :       Yet another double ACK bug.
      36             :  *              Eric Schenk     :       Delayed ACK bug fixes.
      37             :  *              Eric Schenk     :       Floyd style fast retrans war avoidance.
      38             :  *              David S. Miller :       Don't allow zero congestion window.
      39             :  *              Eric Schenk     :       Fix retransmitter so that it sends
      40             :  *                                      next packet on ack of previous packet.
      41             :  *              Andi Kleen      :       Moved open_request checking here
      42             :  *                                      and process RSTs for open_requests.
      43             :  *              Andi Kleen      :       Better prune_queue, and other fixes.
      44             :  *              Andrey Savochkin:       Fix RTT measurements in the presence of
      45             :  *                                      timestamps.
      46             :  *              Andrey Savochkin:       Check sequence numbers correctly when
      47             :  *                                      removing SACKs due to in sequence incoming
      48             :  *                                      data segments.
       49             :  *              Andi Kleen:             Make sure we never ack data that there is
       50             :  *                                      not enough room for. Also make this condition
      51             :  *                                      a fatal error if it might still happen.
      52             :  *              Andi Kleen:             Add tcp_measure_rcv_mss to make
      53             :  *                                      connections with MSS<min(MTU,ann. MSS)
      54             :  *                                      work without delayed acks.
      55             :  *              Andi Kleen:             Process packets with PSH set in the
      56             :  *                                      fast path.
      57             :  *              J Hadi Salim:           ECN support
      58             :  *              Andrei Gurtov,
      59             :  *              Pasi Sarolahti,
      60             :  *              Panu Kuhlberg:          Experimental audit of TCP (re)transmission
       61             :  *                                      engine. Lots of bugs were found.
      62             :  *              Pasi Sarolahti:         F-RTO for dealing with spurious RTOs
      63             :  */
      64             : 
      65             : #define pr_fmt(fmt) "TCP: " fmt
      66             : 
      67             : #include <linux/mm.h>
      68             : #include <linux/slab.h>
      69             : #include <linux/module.h>
      70             : #include <linux/sysctl.h>
      71             : #include <linux/kernel.h>
      72             : #include <linux/prefetch.h>
      73             : #include <net/dst.h>
      74             : #include <net/tcp.h>
      75             : #include <net/inet_common.h>
      76             : #include <linux/ipsec.h>
      77             : #include <asm/unaligned.h>
      78             : #include <linux/errqueue.h>
      79             : #include <trace/events/tcp.h>
      80             : #include <linux/jump_label_ratelimit.h>
      81             : #include <net/busy_poll.h>
      82             : #include <net/mptcp.h>
      83             : 
      84             : int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
      85             : 
      86             : #define FLAG_DATA               0x01 /* Incoming frame contained data.          */
      87             : #define FLAG_WIN_UPDATE         0x02 /* Incoming ACK was a window update.       */
      88             : #define FLAG_DATA_ACKED         0x04 /* This ACK acknowledged new data.         */
      89             : #define FLAG_RETRANS_DATA_ACKED 0x08 /* "" "" some of which was retransmitted.      */
      90             : #define FLAG_SYN_ACKED          0x10 /* This ACK acknowledged SYN.              */
      91             : #define FLAG_DATA_SACKED        0x20 /* New SACK.                               */
      92             : #define FLAG_ECE                0x40 /* ECE in this ACK                         */
      93             : #define FLAG_LOST_RETRANS       0x80 /* This ACK marks some retransmission lost */
      94             : #define FLAG_SLOWPATH           0x100 /* Do not skip RFC checks for window update.*/
      95             : #define FLAG_ORIG_SACK_ACKED    0x200 /* Never retransmitted data are (s)acked  */
      96             : #define FLAG_SND_UNA_ADVANCED   0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */
      97             : #define FLAG_DSACKING_ACK       0x800 /* SACK blocks contained D-SACK info */
      98             : #define FLAG_SET_XMIT_TIMER     0x1000 /* Set TLP or RTO timer */
      99             : #define FLAG_SACK_RENEGING      0x2000 /* snd_una advanced to a sacked seq */
     100             : #define FLAG_UPDATE_TS_RECENT   0x4000 /* tcp_replace_ts_recent() */
     101             : #define FLAG_NO_CHALLENGE_ACK   0x8000 /* do not call tcp_send_challenge_ack()  */
     102             : #define FLAG_ACK_MAYBE_DELAYED  0x10000 /* Likely a delayed ACK */
     103             : 
     104             : #define FLAG_ACKED              (FLAG_DATA_ACKED|FLAG_SYN_ACKED)
     105             : #define FLAG_NOT_DUP            (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED)
     106             : #define FLAG_CA_ALERT           (FLAG_DATA_SACKED|FLAG_ECE|FLAG_DSACKING_ACK)
     107             : #define FLAG_FORWARD_PROGRESS   (FLAG_ACKED|FLAG_DATA_SACKED)
     108             : 
     109             : #define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH)
     110             : #define TCP_HP_BITS (~(TCP_RESERVED_BITS|TCP_FLAG_PSH))
     111             : 
     112             : #define REXMIT_NONE     0 /* no loss recovery to do */
     113             : #define REXMIT_LOST     1 /* retransmit packets marked lost */
     114             : #define REXMIT_NEW      2 /* FRTO-style transmit of unsent/new packets */
     115             : 
     116             : #if IS_ENABLED(CONFIG_TLS_DEVICE)
     117             : static DEFINE_STATIC_KEY_DEFERRED_FALSE(clean_acked_data_enabled, HZ);
     118             : 
     119             : void clean_acked_data_enable(struct inet_connection_sock *icsk,
     120             :                              void (*cad)(struct sock *sk, u32 ack_seq))
     121             : {
     122             :         icsk->icsk_clean_acked = cad;
     123             :         static_branch_deferred_inc(&clean_acked_data_enabled);
     124             : }
     125             : EXPORT_SYMBOL_GPL(clean_acked_data_enable);
     126             : 
     127             : void clean_acked_data_disable(struct inet_connection_sock *icsk)
     128             : {
     129             :         static_branch_slow_dec_deferred(&clean_acked_data_enabled);
     130             :         icsk->icsk_clean_acked = NULL;
     131             : }
     132             : EXPORT_SYMBOL_GPL(clean_acked_data_disable);
     133             : 
     134             : void clean_acked_data_flush(void)
     135             : {
     136             :         static_key_deferred_flush(&clean_acked_data_enabled);
     137             : }
     138             : EXPORT_SYMBOL_GPL(clean_acked_data_flush);
     139             : #endif
     140             : 
     141             : #ifdef CONFIG_CGROUP_BPF
     142             : static void bpf_skops_parse_hdr(struct sock *sk, struct sk_buff *skb)
     143             : {
     144             :         bool unknown_opt = tcp_sk(sk)->rx_opt.saw_unknown &&
     145             :                 BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk),
     146             :                                        BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG);
     147             :         bool parse_all_opt = BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk),
     148             :                                                     BPF_SOCK_OPS_PARSE_ALL_HDR_OPT_CB_FLAG);
     149             :         struct bpf_sock_ops_kern sock_ops;
     150             : 
     151             :         if (likely(!unknown_opt && !parse_all_opt))
     152             :                 return;
     153             : 
     154             :         /* The skb will be handled in the
     155             :          * bpf_skops_established() or
     156             :          * bpf_skops_write_hdr_opt().
     157             :          */
     158             :         switch (sk->sk_state) {
     159             :         case TCP_SYN_RECV:
     160             :         case TCP_SYN_SENT:
     161             :         case TCP_LISTEN:
     162             :                 return;
     163             :         }
     164             : 
     165             :         sock_owned_by_me(sk);
     166             : 
     167             :         memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));
     168             :         sock_ops.op = BPF_SOCK_OPS_PARSE_HDR_OPT_CB;
     169             :         sock_ops.is_fullsock = 1;
     170             :         sock_ops.sk = sk;
     171             :         bpf_skops_init_skb(&sock_ops, skb, tcp_hdrlen(skb));
     172             : 
     173             :         BPF_CGROUP_RUN_PROG_SOCK_OPS(&sock_ops);
     174             : }
     175             : 
     176             : static void bpf_skops_established(struct sock *sk, int bpf_op,
     177             :                                   struct sk_buff *skb)
     178             : {
     179             :         struct bpf_sock_ops_kern sock_ops;
     180             : 
     181             :         sock_owned_by_me(sk);
     182             : 
     183             :         memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));
     184             :         sock_ops.op = bpf_op;
     185             :         sock_ops.is_fullsock = 1;
     186             :         sock_ops.sk = sk;
     187             :         /* sk with TCP_REPAIR_ON does not have skb in tcp_finish_connect */
     188             :         if (skb)
     189             :                 bpf_skops_init_skb(&sock_ops, skb, tcp_hdrlen(skb));
     190             : 
     191             :         BPF_CGROUP_RUN_PROG_SOCK_OPS(&sock_ops);
     192             : }
     193             : #else
     194             : static void bpf_skops_parse_hdr(struct sock *sk, struct sk_buff *skb)
     195             : {
     196             : }
     197             : 
     198           4 : static void bpf_skops_established(struct sock *sk, int bpf_op,
     199             :                                   struct sk_buff *skb)
     200             : {
     201           4 : }
     202             : #endif
     203             : 
     204           0 : static void tcp_gro_dev_warn(struct sock *sk, const struct sk_buff *skb,
     205             :                              unsigned int len)
     206             : {
     207           0 :         static bool __once __read_mostly;
     208             : 
     209           0 :         if (!__once) {
     210           0 :                 struct net_device *dev;
     211             : 
     212           0 :                 __once = true;
     213             : 
     214           0 :                 rcu_read_lock();
     215           0 :                 dev = dev_get_by_index_rcu(sock_net(sk), skb->skb_iif);
     216           0 :                 if (!dev || len >= dev->mtu)
     217           0 :                         pr_warn("%s: Driver has suspect GRO implementation, TCP performance may be compromised.\n",
     218             :                                 dev ? dev->name : "Unknown driver");
     219           0 :                 rcu_read_unlock();
     220             :         }
     221           0 : }
     222             : 
      223             : /* Adapt the MSS value used to make delayed ack decisions to the
     224             :  * real world.
     225             :  */
     226          67 : static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb)
     227             : {
     228          67 :         struct inet_connection_sock *icsk = inet_csk(sk);
     229          67 :         const unsigned int lss = icsk->icsk_ack.last_seg_size;
     230          67 :         unsigned int len;
     231             : 
     232          67 :         icsk->icsk_ack.last_seg_size = 0;
     233             : 
     234             :         /* skb->len may jitter because of SACKs, even if peer
     235             :          * sends good full-sized frames.
     236             :          */
     237          67 :         len = skb_shinfo(skb)->gso_size ? : skb->len;
     238          67 :         if (len >= icsk->icsk_ack.rcv_mss) {
     239          20 :                 icsk->icsk_ack.rcv_mss = min_t(unsigned int, len,
     240             :                                                tcp_sk(sk)->advmss);
     241             :                 /* Account for possibly-removed options */
     242          20 :                 if (unlikely(len > icsk->icsk_ack.rcv_mss +
     243             :                                    MAX_TCP_OPTION_SPACE))
     244           0 :                         tcp_gro_dev_warn(sk, skb, len);
     245             :         } else {
      246             :                 /* Otherwise, we make a more careful check, taking into account
      247             :                  * that the size of the SACK blocks is variable.
      248             :                  *
      249             :                  * "len" is the invariant segment length, including the TCP header.
     250             :                  */
     251          47 :                 len += skb->data - skb_transport_header(skb);
     252          47 :                 if (len >= TCP_MSS_DEFAULT + sizeof(struct tcphdr) ||
     253             :                     /* If PSH is not set, packet should be
     254             :                      * full sized, provided peer TCP is not badly broken.
      255             :                      * This observation (if it is correct 8)) allows us
      256             :                      * to handle super-low mtu links fairly.
     257             :                      */
     258          30 :                     (len >= TCP_MIN_MSS + sizeof(struct tcphdr) &&
     259          30 :                      !(tcp_flag_word(tcp_hdr(skb)) & TCP_REMNANT))) {
      260             :                         /* Also subtract the invariant part (if the peer is RFC
      261             :                          * compliant): the TCP header plus the fixed timestamp option
      262             :                          * length. The resulting "len" is the MSS, free of SACK jitter.
     263             :                          */
     264           0 :                         len -= tcp_sk(sk)->tcp_header_len;
     265           0 :                         icsk->icsk_ack.last_seg_size = len;
     266           0 :                         if (len == lss) {
     267           0 :                                 icsk->icsk_ack.rcv_mss = len;
     268           0 :                                 return;
     269             :                         }
     270             :                 }
     271          47 :                 if (icsk->icsk_ack.pending & ICSK_ACK_PUSHED)
     272           1 :                         icsk->icsk_ack.pending |= ICSK_ACK_PUSHED2;
     273          47 :                 icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
     274             :         }
     275             : }
     276             : 
     277           8 : static void tcp_incr_quickack(struct sock *sk, unsigned int max_quickacks)
     278             : {
     279           8 :         struct inet_connection_sock *icsk = inet_csk(sk);
     280           8 :         unsigned int quickacks = tcp_sk(sk)->rcv_wnd / (2 * icsk->icsk_ack.rcv_mss);
     281             : 
     282           8 :         if (quickacks == 0)
     283           0 :                 quickacks = 2;
     284           8 :         quickacks = min(quickacks, max_quickacks);
     285           8 :         if (quickacks > icsk->icsk_ack.quick)
     286           8 :                 icsk->icsk_ack.quick = quickacks;
     287             : }
     288             : 
     289           0 : void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks)
     290             : {
     291           0 :         struct inet_connection_sock *icsk = inet_csk(sk);
     292             : 
     293           0 :         tcp_incr_quickack(sk, max_quickacks);
     294           0 :         inet_csk_exit_pingpong_mode(sk);
     295           0 :         icsk->icsk_ack.ato = TCP_ATO_MIN;
     296           0 : }
     297             : EXPORT_SYMBOL(tcp_enter_quickack_mode);
     298             : 
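The "quick" ACK budget computed above is simply the number of full-sized segments that fit into half of the receive window, with a floor of 2 and a cap of max_quickacks (callers pass TCP_MAX_QUICKACKS, which is 16 in this kernel). A minimal standalone sketch of that arithmetic; quickack_budget() is an illustrative helper, not a kernel function:

#include <stdio.h>

/* Illustrative restatement of the quickack arithmetic above: how many ACKs
 * may be sent immediately before falling back to delayed ACKs.
 */
static unsigned int quickack_budget(unsigned int rcv_wnd, unsigned int rcv_mss,
                                    unsigned int max_quickacks)
{
        unsigned int quickacks = rcv_wnd / (2 * rcv_mss);

        if (quickacks == 0)
                quickacks = 2;
        return quickacks < max_quickacks ? quickacks : max_quickacks;
}

int main(void)
{
        /* A 64 KB window with a 1460-byte MSS would allow 22 quick ACKs,
         * which the TCP_MAX_QUICKACKS cap reduces to 16.
         */
        printf("%u\n", quickack_budget(65535, 1460, 16));
        return 0;
}
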
     299             : /* Send ACKs quickly, if "quick" count is not exhausted
     300             :  * and the session is not interactive.
     301             :  */
     302             : 
     303          58 : static bool tcp_in_quickack_mode(struct sock *sk)
     304             : {
     305          58 :         const struct inet_connection_sock *icsk = inet_csk(sk);
     306          58 :         const struct dst_entry *dst = __sk_dst_get(sk);
     307             : 
     308          58 :         return (dst && dst_metric(dst, RTAX_QUICKACK)) ||
     309          58 :                 (icsk->icsk_ack.quick && !inet_csk_in_pingpong_mode(sk));
     310             : }
     311             : 
     312           0 : static void tcp_ecn_queue_cwr(struct tcp_sock *tp)
     313             : {
     314           0 :         if (tp->ecn_flags & TCP_ECN_OK)
     315           0 :                 tp->ecn_flags |= TCP_ECN_QUEUE_CWR;
     316             : }
     317             : 
     318         370 : static void tcp_ecn_accept_cwr(struct sock *sk, const struct sk_buff *skb)
     319             : {
     320         370 :         if (tcp_hdr(skb)->cwr) {
     321           0 :                 tcp_sk(sk)->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
     322             : 
     323             :                 /* If the sender is telling us it has entered CWR, then its
     324             :                  * cwnd may be very low (even just 1 packet), so we should ACK
     325             :                  * immediately.
     326             :                  */
     327           0 :                 if (TCP_SKB_CB(skb)->seq != TCP_SKB_CB(skb)->end_seq)
     328           0 :                         inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW;
     329             :         }
     330         370 : }
     331             : 
     332           0 : static void tcp_ecn_withdraw_cwr(struct tcp_sock *tp)
     333             : {
     334           0 :         tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR;
     335           0 : }
     336             : 
     337           0 : static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb)
     338             : {
     339           0 :         struct tcp_sock *tp = tcp_sk(sk);
     340             : 
     341           0 :         switch (TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK) {
     342           0 :         case INET_ECN_NOT_ECT:
     343             :                 /* Funny extension: if ECT is not set on a segment,
      344             :                  * and we have already seen ECT on a previous segment,
     345             :                  * it is probably a retransmit.
     346             :                  */
     347           0 :                 if (tp->ecn_flags & TCP_ECN_SEEN)
     348           0 :                         tcp_enter_quickack_mode(sk, 2);
     349             :                 break;
     350             :         case INET_ECN_CE:
     351           0 :                 if (tcp_ca_needs_ecn(sk))
     352           0 :                         tcp_ca_event(sk, CA_EVENT_ECN_IS_CE);
     353             : 
     354           0 :                 if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) {
     355             :                         /* Better not delay acks, sender can have a very low cwnd */
     356           0 :                         tcp_enter_quickack_mode(sk, 2);
     357           0 :                         tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
     358             :                 }
     359           0 :                 tp->ecn_flags |= TCP_ECN_SEEN;
     360           0 :                 break;
     361             :         default:
     362           0 :                 if (tcp_ca_needs_ecn(sk))
     363           0 :                         tcp_ca_event(sk, CA_EVENT_ECN_NO_CE);
     364           0 :                 tp->ecn_flags |= TCP_ECN_SEEN;
     365           0 :                 break;
     366             :         }
     367           0 : }
     368             : 
     369          67 : static void tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb)
     370             : {
     371          67 :         if (tcp_sk(sk)->ecn_flags & TCP_ECN_OK)
     372           0 :                 __tcp_ecn_check_ce(sk, skb);
     373          67 : }
     374             : 
     375           0 : static void tcp_ecn_rcv_synack(struct tcp_sock *tp, const struct tcphdr *th)
     376             : {
     377           0 :         if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || th->cwr))
     378           0 :                 tp->ecn_flags &= ~TCP_ECN_OK;
     379             : }
     380             : 
     381           0 : static void tcp_ecn_rcv_syn(struct tcp_sock *tp, const struct tcphdr *th)
     382             : {
     383           0 :         if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || !th->cwr))
     384           0 :                 tp->ecn_flags &= ~TCP_ECN_OK;
     385             : }
     386             : 
     387          22 : static bool tcp_ecn_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr *th)
     388             : {
     389          22 :         if (th->ece && !th->syn && (tp->ecn_flags & TCP_ECN_OK))
     390             :                 return true;
     391             :         return false;
     392             : }
     393             : 
     394             : /* Buffer size and advertised window tuning.
     395             :  *
     396             :  * 1. Tuning sk->sk_sndbuf, when connection enters established state.
     397             :  */
     398             : 
     399           4 : static void tcp_sndbuf_expand(struct sock *sk)
     400             : {
     401           4 :         const struct tcp_sock *tp = tcp_sk(sk);
     402           4 :         const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
     403           4 :         int sndmem, per_mss;
     404           4 :         u32 nr_segs;
     405             : 
     406             :         /* Worst case is non GSO/TSO : each frame consumes one skb
     407             :          * and skb->head is kmalloced using power of two area of memory
     408             :          */
     409           4 :         per_mss = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache) +
     410           4 :                   MAX_TCP_HEADER +
     411             :                   SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
     412             : 
     413           4 :         per_mss = roundup_pow_of_two(per_mss) +
     414             :                   SKB_DATA_ALIGN(sizeof(struct sk_buff));
     415             : 
     416           4 :         nr_segs = max_t(u32, TCP_INIT_CWND, tp->snd_cwnd);
     417           4 :         nr_segs = max_t(u32, nr_segs, tp->reordering + 1);
     418             : 
     419             :         /* Fast Recovery (RFC 5681 3.2) :
     420             :          * Cubic needs 1.7 factor, rounded to 2 to include
     421             :          * extra cushion (application might react slowly to EPOLLOUT)
     422             :          */
     423           4 :         sndmem = ca_ops->sndbuf_expand ? ca_ops->sndbuf_expand(sk) : 2;
     424           4 :         sndmem *= nr_segs * per_mss;
     425             : 
     426           4 :         if (sk->sk_sndbuf < sndmem)
     427           4 :                 WRITE_ONCE(sk->sk_sndbuf,
     428             :                            min(sndmem, sock_net(sk)->ipv4.sysctl_tcp_wmem[2]));
     429           4 : }
     430             : 
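tcp_sndbuf_expand() above sizes the send buffer as roughly the worst-case truesize of one MSS-sized skb, times the expected number of in-flight segments, times a factor of 2 for Fast Recovery. A rough standalone sketch of that arithmetic; every size constant below is an assumed placeholder, since the real values of MAX_TCP_HEADER, struct skb_shared_info and struct sk_buff depend on kernel configuration and architecture:

#include <stdio.h>

/* Minimal stand-in for the kernel's roundup_pow_of_two(). */
static unsigned int roundup_pow2(unsigned int x)
{
        unsigned int p = 1;

        while (p < x)
                p <<= 1;
        return p;
}

int main(void)
{
        unsigned int mss            = 1460;   /* max(mss_clamp, mss_cache), assumed  */
        unsigned int max_tcp_header = 320;    /* assumed MAX_TCP_HEADER               */
        unsigned int shinfo         = 320;    /* assumed skb_shared_info, aligned     */
        unsigned int skbuff         = 256;    /* assumed struct sk_buff, aligned      */
        unsigned int nr_segs        = 10;     /* max(TCP_INIT_CWND, cwnd, reord + 1)  */

        unsigned int per_mss = roundup_pow2(mss + max_tcp_header + shinfo) + skbuff;
        unsigned int sndmem  = 2 * nr_segs * per_mss;   /* 2x: Fast Recovery cushion  */

        printf("per_mss=%u bytes, sndbuf target=%u bytes\n", per_mss, sndmem);
        return 0;
}
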
     431             : /* 2. Tuning advertised window (window_clamp, rcv_ssthresh)
     432             :  *
     433             :  * All tcp_full_space() is split to two parts: "network" buffer, allocated
     434             :  * forward and advertised in receiver window (tp->rcv_wnd) and
     435             :  * "application buffer", required to isolate scheduling/application
     436             :  * latencies from network.
      437             :  * window_clamp is the maximal advertised window. It can be less than
      438             :  * tcp_full_space(); in that case tcp_full_space() - window_clamp
      439             :  * is reserved for the "application" buffer. The smaller window_clamp is,
      440             :  * the smoother our behaviour from the network's point of view, but the
      441             :  * lower the throughput and the higher the connection's sensitivity to losses. 8)
      442             :  *
      443             :  * rcv_ssthresh is a stricter window_clamp used during the "slow start"
      444             :  * phase to predict the further behaviour of this connection.
      445             :  * It is used for two goals:
      446             :  * - to enforce header prediction at the sender, even when the application
      447             :  *   requires some significant "application buffer". This is check #1.
      448             :  * - to prevent pruning of the receive queue because of misprediction
      449             :  *   of the receiver window. Check #2.
      450             :  *
      451             :  * The scheme does not work when the sender sends good segments opening
      452             :  * the window and then starts to feed us spaghetti. But it should work
      453             :  * in common situations. Otherwise, we have to rely on queue collapsing.
     454             :  */
     455             : 
     456             : /* Slow part of check#2. */
     457           0 : static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb)
     458             : {
     459           0 :         struct tcp_sock *tp = tcp_sk(sk);
     460             :         /* Optimize this! */
     461           0 :         int truesize = tcp_win_from_space(sk, skb->truesize) >> 1;
     462           0 :         int window = tcp_win_from_space(sk, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]) >> 1;
     463             : 
     464           0 :         while (tp->rcv_ssthresh <= window) {
     465           0 :                 if (truesize <= skb->len)
     466           0 :                         return 2 * inet_csk(sk)->icsk_ack.rcv_mss;
     467             : 
     468           0 :                 truesize >>= 1;
     469           0 :                 window >>= 1;
     470             :         }
     471             :         return 0;
     472             : }
     473             : 
     474          33 : static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb)
     475             : {
     476          33 :         struct tcp_sock *tp = tcp_sk(sk);
     477          33 :         int room;
     478             : 
     479          33 :         room = min_t(int, tp->window_clamp, tcp_space(sk)) - tp->rcv_ssthresh;
     480             : 
     481             :         /* Check #1 */
     482          33 :         if (room > 0 && !tcp_under_memory_pressure(sk)) {
     483          11 :                 int incr;
     484             : 
     485             :                 /* Check #2. Increase window, if skb with such overhead
     486             :                  * will fit to rcvbuf in future.
     487             :                  */
     488          22 :                 if (tcp_win_from_space(sk, skb->truesize) <= skb->len)
     489          11 :                         incr = 2 * tp->advmss;
     490             :                 else
     491           0 :                         incr = __tcp_grow_window(sk, skb);
     492             : 
     493          11 :                 if (incr) {
     494          11 :                         incr = max_t(int, incr, 2 * skb->len);
     495          11 :                         tp->rcv_ssthresh += min(room, incr);
     496          11 :                         inet_csk(sk)->icsk_ack.quick |= 1;
     497             :                 }
     498             :         }
     499          33 : }
     500             : 
     501             : /* 3. Try to fixup all. It is made immediately after connection enters
     502             :  *    established state.
     503             :  */
     504           4 : static void tcp_init_buffer_space(struct sock *sk)
     505             : {
     506           4 :         int tcp_app_win = sock_net(sk)->ipv4.sysctl_tcp_app_win;
     507           4 :         struct tcp_sock *tp = tcp_sk(sk);
     508           4 :         int maxwin;
     509             : 
     510           4 :         if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK))
     511           4 :                 tcp_sndbuf_expand(sk);
     512             : 
     513           4 :         tcp_mstamp_refresh(tp);
     514           4 :         tp->rcvq_space.time = tp->tcp_mstamp;
     515           4 :         tp->rcvq_space.seq = tp->copied_seq;
     516             : 
     517           4 :         maxwin = tcp_full_space(sk);
     518             : 
     519           4 :         if (tp->window_clamp >= maxwin) {
     520           0 :                 tp->window_clamp = maxwin;
     521             : 
     522           0 :                 if (tcp_app_win && maxwin > 4 * tp->advmss)
     523           0 :                         tp->window_clamp = max(maxwin -
     524             :                                                (maxwin >> tcp_app_win),
     525             :                                                4 * tp->advmss);
     526             :         }
     527             : 
     528             :         /* Force reservation of one segment. */
     529           4 :         if (tcp_app_win &&
     530           4 :             tp->window_clamp > 2 * tp->advmss &&
     531           4 :             tp->window_clamp + tp->advmss > maxwin)
     532           4 :                 tp->window_clamp = max(2 * tp->advmss, maxwin - tp->advmss);
     533             : 
     534           4 :         tp->rcv_ssthresh = min(tp->rcv_ssthresh, tp->window_clamp);
     535           4 :         tp->snd_cwnd_stamp = tcp_jiffies32;
     536           4 :         tp->rcvq_space.space = min3(tp->rcv_ssthresh, tp->rcv_wnd,
     537             :                                     (u32)TCP_INIT_CWND * tp->advmss);
     538           4 : }
     539             : 
     540             : /* 4. Recalculate window clamp after socket hit its memory bounds. */
     541           0 : static void tcp_clamp_window(struct sock *sk)
     542             : {
     543           0 :         struct tcp_sock *tp = tcp_sk(sk);
     544           0 :         struct inet_connection_sock *icsk = inet_csk(sk);
     545           0 :         struct net *net = sock_net(sk);
     546             : 
     547           0 :         icsk->icsk_ack.quick = 0;
     548             : 
     549           0 :         if (sk->sk_rcvbuf < net->ipv4.sysctl_tcp_rmem[2] &&
     550           0 :             !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) &&
     551           0 :             !tcp_under_memory_pressure(sk) &&
     552           0 :             sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)) {
     553           0 :                 WRITE_ONCE(sk->sk_rcvbuf,
     554             :                            min(atomic_read(&sk->sk_rmem_alloc),
     555             :                                net->ipv4.sysctl_tcp_rmem[2]));
     556             :         }
     557           0 :         if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf)
     558           0 :                 tp->rcv_ssthresh = min(tp->window_clamp, 2U * tp->advmss);
     559           0 : }
     560             : 
     561             : /* Initialize RCV_MSS value.
      562             :  * RCV_MSS is our guess about the MSS used by the peer.
      563             :  * We don't have any direct information about the MSS.
      564             :  * It's better to underestimate RCV_MSS than to overestimate it.
      565             :  * Overestimating it makes us ACK less frequently than needed.
      566             :  * Underestimates are easier to detect and fix by tcp_measure_rcv_mss().
     567             :  */
     568           8 : void tcp_initialize_rcv_mss(struct sock *sk)
     569             : {
     570           8 :         const struct tcp_sock *tp = tcp_sk(sk);
     571           8 :         unsigned int hint = min_t(unsigned int, tp->advmss, tp->mss_cache);
     572             : 
     573           8 :         hint = min(hint, tp->rcv_wnd / 2);
     574           8 :         hint = min(hint, TCP_MSS_DEFAULT);
     575           8 :         hint = max(hint, TCP_MIN_MSS);
     576             : 
     577           4 :         inet_csk(sk)->icsk_ack.rcv_mss = hint;
     578           4 : }
     579             : EXPORT_SYMBOL(tcp_initialize_rcv_mss);
     580             : 
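The initial guess above is just the advertised MSS clamped from above by half of the receive window and by TCP_MSS_DEFAULT (536), and from below by TCP_MIN_MSS (88), which deliberately errs toward underestimating. A small standalone sketch with those constants written out; initial_rcv_mss() is an illustrative name, not a kernel function:

#include <stdio.h>

static unsigned int initial_rcv_mss(unsigned int advmss, unsigned int mss_cache,
                                    unsigned int rcv_wnd)
{
        unsigned int hint = advmss < mss_cache ? advmss : mss_cache;

        if (hint > rcv_wnd / 2)
                hint = rcv_wnd / 2;     /* never assume more than half the window */
        if (hint > 536)
                hint = 536;             /* TCP_MSS_DEFAULT                        */
        if (hint < 88)
                hint = 88;              /* TCP_MIN_MSS                            */
        return hint;
}

int main(void)
{
        /* advmss 1460, mss_cache 1460, rcv_wnd 64 KB -> conservative guess of 536 */
        printf("%u\n", initial_rcv_mss(1460, 1460, 65535));
        return 0;
}
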
     581             : /* Receiver "autotuning" code.
     582             :  *
     583             :  * The algorithm for RTT estimation w/o timestamps is based on
     584             :  * Dynamic Right-Sizing (DRS) by Wu Feng and Mike Fisk of LANL.
     585             :  * <https://public.lanl.gov/radiant/pubs.html#DRS>
     586             :  *
     587             :  * More detail on this code can be found at
     588             :  * <http://staff.psc.edu/jheffner/>,
     589             :  * though this reference is out of date.  A new paper
     590             :  * is pending.
     591             :  */
     592           3 : static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep)
     593             : {
     594           3 :         u32 new_sample = tp->rcv_rtt_est.rtt_us;
     595           3 :         long m = sample;
     596             : 
     597           3 :         if (new_sample != 0) {
      598             :                 /* If we took larger samples in the non-timestamp
      599             :                  * case, we could grossly overestimate the RTT, especially
      600             :                  * with chatty applications or bulk transfer apps which
      601             :                  * are stalled on filesystem I/O.
      602             :                  *
      603             :                  * Also, since we are only going for a minimum in the
      604             :                  * non-timestamp case, we do not smooth things out;
      605             :                  * otherwise, with timestamps disabled, convergence takes
      606             :                  * too long.
     607             :                  */
     608           2 :                 if (!win_dep) {
     609           0 :                         m -= (new_sample >> 3);
     610           0 :                         new_sample += m;
     611             :                 } else {
     612           2 :                         m <<= 3;
     613           2 :                         if (m < new_sample)
     614           1 :                                 new_sample = m;
     615             :                 }
     616             :         } else {
     617             :                 /* No previous measure. */
     618           1 :                 new_sample = m << 3;
     619             :         }
     620             : 
     621           3 :         tp->rcv_rtt_est.rtt_us = new_sample;
     622           3 : }
     623             : 
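The receiver-side RTT filter above keeps its estimate left-shifted by 3 (i.e. eight times the value in microseconds): the timestamp path runs an EWMA with gain 1/8, while the window-based path only tracks the minimum of the samples. A standalone sketch of both branches; rcv_rtt_update() is an illustrative stand-in for the kernel helper:

#include <stdio.h>

static long rcv_rtt_update(long rtt_us_x8, long sample_us, int win_dep)
{
        long m = sample_us;

        if (rtt_us_x8 == 0)
                return m << 3;                  /* first measurement               */

        if (!win_dep) {                         /* timestamp path: EWMA, gain 1/8  */
                m -= rtt_us_x8 >> 3;
                return rtt_us_x8 + m;
        }
        m <<= 3;                                /* window path: keep the minimum   */
        return m < rtt_us_x8 ? m : rtt_us_x8;
}

int main(void)
{
        long est = 0;

        est = rcv_rtt_update(est, 40000, 1);    /* 40 ms sample                     */
        est = rcv_rtt_update(est, 25000, 1);    /* a smaller sample wins the filter */
        printf("rcv rtt estimate = %ld us\n", est >> 3);   /* 25000                 */
        return 0;
}
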
     624          67 : static inline void tcp_rcv_rtt_measure(struct tcp_sock *tp)
     625             : {
     626          67 :         u32 delta_us;
     627             : 
     628          67 :         if (tp->rcv_rtt_est.time == 0)
     629           4 :                 goto new_measure;
     630          63 :         if (before(tp->rcv_nxt, tp->rcv_rtt_est.seq))
     631             :                 return;
     632           3 :         delta_us = tcp_stamp_us_delta(tp->tcp_mstamp, tp->rcv_rtt_est.time);
     633           3 :         if (!delta_us)
     634             :                 delta_us = 1;
     635           3 :         tcp_rcv_rtt_update(tp, delta_us, 1);
     636             : 
     637           7 : new_measure:
     638           7 :         tp->rcv_rtt_est.seq = tp->rcv_nxt + tp->rcv_wnd;
     639           7 :         tp->rcv_rtt_est.time = tp->tcp_mstamp;
     640             : }
     641             : 
     642          70 : static inline void tcp_rcv_rtt_measure_ts(struct sock *sk,
     643             :                                           const struct sk_buff *skb)
     644             : {
     645          70 :         struct tcp_sock *tp = tcp_sk(sk);
     646             : 
     647          70 :         if (tp->rx_opt.rcv_tsecr == tp->rcv_rtt_last_tsecr)
     648             :                 return;
     649           0 :         tp->rcv_rtt_last_tsecr = tp->rx_opt.rcv_tsecr;
     650             : 
     651           0 :         if (TCP_SKB_CB(skb)->end_seq -
     652           0 :             TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss) {
     653           0 :                 u32 delta = tcp_time_stamp(tp) - tp->rx_opt.rcv_tsecr;
     654           0 :                 u32 delta_us;
     655             : 
     656           0 :                 if (likely(delta < INT_MAX / (USEC_PER_SEC / TCP_TS_HZ))) {
     657           0 :                         if (!delta)
     658             :                                 delta = 1;
     659           0 :                         delta_us = delta * (USEC_PER_SEC / TCP_TS_HZ);
     660           0 :                         tcp_rcv_rtt_update(tp, delta_us, 0);
     661             :                 }
     662             :         }
     663             : }
     664             : 
     665             : /*
     666             :  * This function should be called every time data is copied to user space.
     667             :  * It calculates the appropriate TCP receive buffer space.
     668             :  */
     669         215 : void tcp_rcv_space_adjust(struct sock *sk)
     670             : {
     671         215 :         struct tcp_sock *tp = tcp_sk(sk);
     672         215 :         u32 copied;
     673         215 :         int time;
     674             : 
     675         215 :         trace_tcp_rcv_space_adjust(sk);
     676             : 
     677         215 :         tcp_mstamp_refresh(tp);
     678         215 :         time = tcp_stamp_us_delta(tp->tcp_mstamp, tp->rcvq_space.time);
     679         215 :         if (time < (tp->rcv_rtt_est.rtt_us >> 3) || tp->rcv_rtt_est.rtt_us == 0)
     680             :                 return;
     681             : 
     682             :         /* Number of bytes copied to user in last RTT */
     683           8 :         copied = tp->copied_seq - tp->rcvq_space.seq;
     684           8 :         if (copied <= tp->rcvq_space.space)
     685           7 :                 goto new_measure;
     686             : 
     687             :         /* A bit of theory :
     688             :          * copied = bytes received in previous RTT, our base window
     689             :          * To cope with packet losses, we need a 2x factor
      690             :          * To cope with slow start, and the sender growing its cwnd by 100 %
     691             :          * every RTT, we need a 4x factor, because the ACK we are sending
     692             :          * now is for the next RTT, not the current one :
     693             :          * <prev RTT . ><current RTT .. ><next RTT .... >
     694             :          */
     695             : 
     696           1 :         if (sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf &&
     697           1 :             !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
     698           1 :                 int rcvmem, rcvbuf;
     699           1 :                 u64 rcvwin, grow;
     700             : 
     701             :                 /* minimal window to cope with packet losses, assuming
     702             :                  * steady state. Add some cushion because of small variations.
     703             :                  */
     704           1 :                 rcvwin = ((u64)copied << 1) + 16 * tp->advmss;
     705             : 
     706             :                 /* Accommodate for sender rate increase (eg. slow start) */
     707           1 :                 grow = rcvwin * (copied - tp->rcvq_space.space);
     708           1 :                 do_div(grow, tp->rcvq_space.space);
     709           1 :                 rcvwin += (grow << 1);
     710             : 
     711           1 :                 rcvmem = SKB_TRUESIZE(tp->advmss + MAX_TCP_HEADER);
     712          14 :                 while (tcp_win_from_space(sk, rcvmem) < tp->advmss)
     713           6 :                         rcvmem += 128;
     714             : 
     715           1 :                 do_div(rcvwin, tp->advmss);
     716           1 :                 rcvbuf = min_t(u64, rcvwin * rcvmem,
     717             :                                sock_net(sk)->ipv4.sysctl_tcp_rmem[2]);
     718           1 :                 if (rcvbuf > sk->sk_rcvbuf) {
     719           1 :                         WRITE_ONCE(sk->sk_rcvbuf, rcvbuf);
     720             : 
     721             :                         /* Make the window clamp follow along.  */
     722           2 :                         tp->window_clamp = tcp_win_from_space(sk, rcvbuf);
     723             :                 }
     724             :         }
     725           1 :         tp->rcvq_space.space = copied;
     726             : 
     727           8 : new_measure:
     728           8 :         tp->rcvq_space.seq = tp->copied_seq;
     729           8 :         tp->rcvq_space.time = tp->tcp_mstamp;
     730             : }
     731             : 
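The DRS sizing above works in bytes per RTT: double what was copied to user space in the last RTT, add a 16*MSS cushion, inflate by the observed growth since the previous measurement, and convert segments to buffer space via the per-segment truesize. A simplified standalone restatement with made-up numbers; the truesize value is assumed and the final clamp against tcp_rmem[2] is omitted:

#include <stdio.h>

int main(void)
{
        unsigned long long copied     = 200000;  /* bytes copied to user in last RTT */
        unsigned long long prev_space = 100000;  /* previous rcvq_space.space        */
        unsigned long long advmss     = 1460;
        unsigned long long rcvmem     = 2304;    /* assumed truesize of one MSS skb  */

        /* 2x for packet losses, plus a 16 * MSS cushion */
        unsigned long long rcvwin = (copied << 1) + 16 * advmss;

        /* accommodate the sender's rate increase (e.g. slow start) */
        unsigned long long grow = rcvwin * (copied - prev_space) / prev_space;
        rcvwin += grow << 1;

        /* segments that fit in the window, times the memory each one costs */
        unsigned long long rcvbuf = rcvwin / advmss * rcvmem;

        printf("rcvwin=%llu bytes, rcvbuf target=%llu bytes\n", rcvwin, rcvbuf);
        return 0;
}
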
     732             : /* There is something which you must keep in mind when you analyze the
     733             :  * behavior of the tp->ato delayed ack timeout interval.  When a
     734             :  * connection starts up, we want to ack as quickly as possible.  The
      735             :  * problem is that "good" TCPs do slow start at the beginning of data
      736             :  * transmission.  This means that until we send the first few ACKs the
     737             :  * sender will sit on his end and only queue most of his data, because
     738             :  * he can only send snd_cwnd unacked packets at any given time.  For
     739             :  * each ACK we send, he increments snd_cwnd and transmits more of his
     740             :  * queue.  -DaveM
     741             :  */
     742          67 : static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb)
     743             : {
     744          67 :         struct tcp_sock *tp = tcp_sk(sk);
     745          67 :         struct inet_connection_sock *icsk = inet_csk(sk);
     746          67 :         u32 now;
     747             : 
     748          67 :         inet_csk_schedule_ack(sk);
     749             : 
     750          67 :         tcp_measure_rcv_mss(sk, skb);
     751             : 
     752          67 :         tcp_rcv_rtt_measure(tp);
     753             : 
     754          67 :         now = tcp_jiffies32;
     755             : 
     756          67 :         if (!icsk->icsk_ack.ato) {
     757             :                 /* The _first_ data packet received, initialize
     758             :                  * delayed ACK engine.
     759             :                  */
     760           4 :                 tcp_incr_quickack(sk, TCP_MAX_QUICKACKS);
     761           4 :                 icsk->icsk_ack.ato = TCP_ATO_MIN;
     762             :         } else {
     763          63 :                 int m = now - icsk->icsk_ack.lrcvtime;
     764             : 
     765          63 :                 if (m <= TCP_ATO_MIN / 2) {
     766             :                         /* The fastest case is the first. */
     767          50 :                         icsk->icsk_ack.ato = (icsk->icsk_ack.ato >> 1) + TCP_ATO_MIN / 2;
     768          13 :                 } else if (m < icsk->icsk_ack.ato) {
     769           1 :                         icsk->icsk_ack.ato = (icsk->icsk_ack.ato >> 1) + m;
     770           1 :                         if (icsk->icsk_ack.ato > icsk->icsk_rto)
     771           0 :                                 icsk->icsk_ack.ato = icsk->icsk_rto;
     772          12 :                 } else if (m > icsk->icsk_rto) {
     773             :                         /* Too long gap. Apparently sender failed to
     774             :                          * restart window, so that we send ACKs quickly.
     775             :                          */
     776           4 :                         tcp_incr_quickack(sk, TCP_MAX_QUICKACKS);
     777           4 :                         sk_mem_reclaim(sk);
     778             :                 }
     779             :         }
     780          67 :         icsk->icsk_ack.lrcvtime = now;
     781             : 
     782          67 :         tcp_ecn_check_ce(sk, skb);
     783             : 
     784          67 :         if (skb->len >= 128)
     785          33 :                 tcp_grow_window(sk, skb);
     786          67 : }
     787             : 
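The ATO adaptation above halves the current delayed-ACK timeout and adds either TCP_ATO_MIN/2 (back-to-back data) or the observed inter-arrival gap, never letting the result exceed the RTO; a gap longer than the RTO falls back to quickack mode instead of adjusting the ATO. A compact sketch of just that state update; ato_update() is an illustrative helper working in jiffies-like units:

#include <stdio.h>

static unsigned int ato_update(unsigned int ato, unsigned int gap,
                               unsigned int ato_min, unsigned int rto)
{
        if (gap <= ato_min / 2)
                return (ato >> 1) + ato_min / 2;    /* back-to-back data: shrink fast */
        if (gap < ato) {
                unsigned int next = (ato >> 1) + gap;

                return next > rto ? rto : next;     /* never exceed the RTO           */
        }
        return ato;     /* a gap above the RTO re-enters quickack mode instead        */
}

int main(void)
{
        /* ATO 40, arrival gap 4, ATO_MIN 40, RTO 200 -> stays at 40 */
        printf("%u\n", ato_update(40, 4, 40, 200));
        return 0;
}
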
     788             : /* Called to compute a smoothed rtt estimate. The data fed to this
     789             :  * routine either comes from timestamps, or from segments that were
     790             :  * known _not_ to have been retransmitted [see Karn/Partridge
     791             :  * Proceedings SIGCOMM 87]. The algorithm is from the SIGCOMM 88
     792             :  * piece by Van Jacobson.
     793             :  * NOTE: the next three routines used to be one big routine.
     794             :  * To save cycles in the RFC 1323 implementation it was better to break
     795             :  * it up into three procedures. -- erics
     796             :  */
     797         355 : static void tcp_rtt_estimator(struct sock *sk, long mrtt_us)
     798             : {
     799         355 :         struct tcp_sock *tp = tcp_sk(sk);
     800         355 :         long m = mrtt_us; /* RTT */
     801         355 :         u32 srtt = tp->srtt_us;
     802             : 
     803             :         /*      The following amusing code comes from Jacobson's
     804             :          *      article in SIGCOMM '88.  Note that rtt and mdev
     805             :          *      are scaled versions of rtt and mean deviation.
     806             :          *      This is designed to be as fast as possible
     807             :          *      m stands for "measurement".
     808             :          *
      809             :          *      In a 1990 paper the rto value is changed to:
     810             :          *      RTO = rtt + 4 * mdev
     811             :          *
     812             :          * Funny. This algorithm seems to be very broken.
      813             :          * These formulae increase RTO when it should be decreased, increase it
      814             :          * too slowly when it should be increased quickly, decrease it too quickly,
      815             :          * etc. I guess in BSD RTO takes ONE value, so it absolutely
      816             :          * does not matter how to _calculate_ it. It seems it was a trap
      817             :          * that VJ failed to avoid. 8)
     818             :          */
     819         355 :         if (srtt != 0) {
     820         351 :                 m -= (srtt >> 3); /* m is now error in rtt est */
     821         351 :                 srtt += m;              /* rtt = 7/8 rtt + 1/8 new */
     822         351 :                 if (m < 0) {
     823         237 :                         m = -m;         /* m is now abs(error) */
     824         237 :                         m -= (tp->mdev_us >> 2);   /* similar update on mdev */
     825             :                         /* This is similar to one of Eifel findings.
     826             :                          * Eifel blocks mdev updates when rtt decreases.
     827             :                          * This solution is a bit different: we use finer gain
     828             :                          * for mdev in this case (alpha*beta).
     829             :                          * Like Eifel it also prevents growth of rto,
     830             :                          * but also it limits too fast rto decreases,
     831             :                          * happening in pure Eifel.
     832             :                          */
     833         237 :                         if (m > 0)
     834          70 :                                 m >>= 3;
     835             :                 } else {
     836         114 :                         m -= (tp->mdev_us >> 2);   /* similar update on mdev */
     837             :                 }
     838         351 :                 tp->mdev_us += m;            /* mdev = 3/4 mdev + 1/4 new */
     839         351 :                 if (tp->mdev_us > tp->mdev_max_us) {
     840           0 :                         tp->mdev_max_us = tp->mdev_us;
     841           0 :                         if (tp->mdev_max_us > tp->rttvar_us)
     842           0 :                                 tp->rttvar_us = tp->mdev_max_us;
     843             :                 }
     844         351 :                 if (after(tp->snd_una, tp->rtt_seq)) {
     845         254 :                         if (tp->mdev_max_us < tp->rttvar_us)
     846           0 :                                 tp->rttvar_us -= (tp->rttvar_us - tp->mdev_max_us) >> 2;
     847         254 :                         tp->rtt_seq = tp->snd_nxt;
     848         254 :                         tp->mdev_max_us = tcp_rto_min_us(sk);
     849             : 
     850         254 :                         tcp_bpf_rtt(sk);
     851             :                 }
     852             :         } else {
     853             :                 /* no previous measure. */
     854           4 :                 srtt = m << 3;            /* take the measured time to be rtt */
     855           4 :                 tp->mdev_us = m << 1;  /* make sure rto = 3*rtt */
     856           4 :                 tp->rttvar_us = max(tp->mdev_us, tcp_rto_min_us(sk));
     857           4 :                 tp->mdev_max_us = tp->rttvar_us;
     858           4 :                 tp->rtt_seq = tp->snd_nxt;
     859             : 
     860           4 :                 tcp_bpf_rtt(sk);
     861             :         }
     862         355 :         tp->srtt_us = max(1U, srtt);
     863         355 : }
     864             : 
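The estimator above is the classic Jacobson/Karels filter in fixed point: srtt_us holds eight times the smoothed RTT and mdev_us four times the mean deviation, so integer shifts implement the 1/8 and 1/4 gains. A standalone sketch that reproduces the core update but deliberately leaves out the mdev_max/rttvar windowing and the Eifel-style damping of mdev decreases:

#include <stdio.h>

struct rtt_state {
        long srtt_x8;   /* smoothed RTT, shifted left by 3     */
        long mdev_x4;   /* mean deviation, shifted left by 2   */
};

static void rtt_update(struct rtt_state *s, long m_us)
{
        if (!s->srtt_x8) {
                s->srtt_x8 = m_us << 3;          /* first sample: srtt = m         */
                s->mdev_x4 = m_us << 1;          /* so the initial rto is ~3 * rtt */
                return;
        }
        long err = m_us - (s->srtt_x8 >> 3);     /* error vs current estimate      */
        s->srtt_x8 += err;                       /* srtt = 7/8 srtt + 1/8 m        */
        if (err < 0)
                err = -err;
        s->mdev_x4 += err - (s->mdev_x4 >> 2);   /* mdev = 3/4 mdev + 1/4 |err|    */
}

int main(void)
{
        struct rtt_state s = { 0, 0 };
        long samples[] = { 100000, 120000, 90000, 110000 };   /* microseconds */

        for (int i = 0; i < 4; i++)
                rtt_update(&s, samples[i]);
        printf("srtt=%ld us, mdev=%ld us\n", s.srtt_x8 >> 3, s.mdev_x4 >> 2);
        return 0;
}
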
     865         355 : static void tcp_update_pacing_rate(struct sock *sk)
     866             : {
     867         355 :         const struct tcp_sock *tp = tcp_sk(sk);
     868         355 :         u64 rate;
     869             : 
     870             :         /* set sk_pacing_rate to 200 % of current rate (mss * cwnd / srtt) */
     871         355 :         rate = (u64)tp->mss_cache * ((USEC_PER_SEC / 100) << 3);
     872             : 
     873             :         /* current rate is (cwnd * mss) / srtt
      874             :          * In Slow Start [1], set sk_pacing_rate to 200 % of the current rate.
      875             :          * In Congestion Avoidance phase, set it to 120 % of the current rate.
     876             :          *
     877             :          * [1] : Normal Slow Start condition is (tp->snd_cwnd < tp->snd_ssthresh)
     878             :          *       If snd_cwnd >= (tp->snd_ssthresh / 2), we are approaching
     879             :          *       end of slow start and should slow down.
     880             :          */
     881         355 :         if (tp->snd_cwnd < tp->snd_ssthresh / 2)
     882         355 :                 rate *= sock_net(sk)->ipv4.sysctl_tcp_pacing_ss_ratio;
     883             :         else
     884           0 :                 rate *= sock_net(sk)->ipv4.sysctl_tcp_pacing_ca_ratio;
     885             : 
     886         355 :         rate *= max(tp->snd_cwnd, tp->packets_out);
     887             : 
     888         355 :         if (likely(tp->srtt_us))
     889         355 :                 do_div(rate, tp->srtt_us);
     890             : 
     891             :         /* WRITE_ONCE() is needed because sch_fq fetches sk_pacing_rate
     892             :          * without any lock. We want to make sure compiler wont store
     893             :          * intermediate values in this location.
     894             :          */
     895         355 :         WRITE_ONCE(sk->sk_pacing_rate, min_t(u64, rate,
     896             :                                              sk->sk_max_pacing_rate));
     897         355 : }
     898             : 
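The pacing rate is essentially ratio% of mss * cwnd / srtt. A worked example in plain units; the kernel keeps srtt left-shifted by 3 and folds that shift into the (USEC_PER_SEC / 100) << 3 constant, a detail the sketch below skips:

#include <stdio.h>

int main(void)
{
        unsigned long long mss     = 1460;
        unsigned long long cwnd    = 10;
        unsigned long long srtt_us = 50000;      /* 50 ms round-trip time            */
        unsigned long long ratio   = 200;        /* slow-start pacing ratio, percent */

        unsigned long long rate = mss * cwnd * 1000000ULL / srtt_us;   /* bytes/sec */
        rate = rate * ratio / 100;

        printf("pacing rate = %llu bytes/sec\n", rate);   /* 584000 */
        return 0;
}
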
     899             : /* Calculate rto without backoff.  This is the second half of Van Jacobson's
     900             :  * routine referred to above.
     901             :  */
     902         355 : static void tcp_set_rto(struct sock *sk)
     903             : {
     904         355 :         const struct tcp_sock *tp = tcp_sk(sk);
     905             :         /* Old crap is replaced with new one. 8)
     906             :          *
     907             :          * More seriously:
      908             :          * 1. If the rtt variance happened to be less than 50 msec, it is a hallucination.
      909             :          *    It cannot be less due to the utterly erratic ACK generation made
      910             :          *    at least by Solaris and FreeBSD. "Erratic ACKs" have _nothing_
     911             :          *    to do with delayed acks, because at cwnd>2 true delack timeout
     912             :          *    is invisible. Actually, Linux-2.4 also generates erratic
     913             :          *    ACKs in some circumstances.
     914             :          */
     915         355 :         inet_csk(sk)->icsk_rto = __tcp_set_rto(tp);
     916             : 
     917             :         /* 2. Fixups made earlier cannot be right.
     918             :          *    If we do not estimate RTO correctly without them,
      919             :          *    the whole algorithm is broken and should be replaced
      920             :          *    with a correct one, which is exactly what we pretend to do.
     921             :          */
     922             : 
     923             :         /* NOTE: clamping at TCP_RTO_MIN is not required, current algo
     924             :          * guarantees that rto is higher.
     925             :          */
     926         355 :         tcp_bound_rto(sk);
     927         355 : }
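
tcp_set_rto() defers the arithmetic to __tcp_set_rto(), which in this kernel version boils down to the classic Van Jacobson formula RTO ~= SRTT + 4 * RTTVAR expressed in the fixed-point units used above, i.e. (srtt_us >> 3) + rttvar_us, with tcp_bound_rto() applying an upper bound afterwards. The user-space sketch below reproduces that arithmetic; the ceiling value and the function name are assumptions made for the demo, not the kernel's definitions.

#include <stdint.h>
#include <stdio.h>

#define DEMO_RTO_MAX_US 120000000U      /* assumed 120 s ceiling, demo only */

/* Illustrative sketch of RTO = (srtt_us >> 3) + rttvar_us, bounded above.
 * No lower clamp is needed here: rttvar_us is already floored at the RTO
 * minimum by the estimator earlier in this file.
 */
static uint32_t demo_rto_us(uint32_t srtt_us, uint32_t rttvar_us)
{
        uint32_t rto = (srtt_us >> 3) + rttvar_us;

        return rto > DEMO_RTO_MAX_US ? DEMO_RTO_MAX_US : rto;
}

int main(void)
{
        /* SRTT 25 ms (stored as 8x), RTTVAR 200 ms -> RTO 225 ms */
        printf("rto = %u us\n", demo_rto_us(25000U << 3, 200000U));
        return 0;
}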
     928             : 
     929           8 : __u32 tcp_init_cwnd(const struct tcp_sock *tp, const struct dst_entry *dst)
     930             : {
     931           8 :         __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0);
     932             : 
     933           8 :         if (!cwnd)
     934             :                 cwnd = TCP_INIT_CWND;
     935           8 :         return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
     936             : }
     937             : 
     938             : struct tcp_sacktag_state {
     939             :         /* Timestamps for earliest and latest never-retransmitted segment
     940             :          * that was SACKed. RTO needs the earliest RTT to stay conservative,
     941             :          * but congestion control should still get an accurate delay signal.
     942             :          */
     943             :         u64     first_sackt;
     944             :         u64     last_sackt;
     945             :         u32     reord;
     946             :         u32     sack_delivered;
     947             :         int     flag;
     948             :         unsigned int mss_now;
     949             :         struct rate_sample *rate;
     950             : };
     951             : 
      952             : /* Take note that the peer is sending D-SACKs. Skip the update of data delivery
      953             :  * and spurious retransmission information if this DSACK is unlikely to have been
      954             :  * caused by the sender's action:
      955             :  * - The DSACKed sequence range is larger than the maximum receiver window.
      956             :  * - The total no. of DSACKed segments exceeds the total no. of retransmitted segs.
     957             :  */
     958           0 : static u32 tcp_dsack_seen(struct tcp_sock *tp, u32 start_seq,
     959             :                           u32 end_seq, struct tcp_sacktag_state *state)
     960             : {
     961           0 :         u32 seq_len, dup_segs = 1;
     962             : 
     963           0 :         if (!before(start_seq, end_seq))
     964             :                 return 0;
     965             : 
     966           0 :         seq_len = end_seq - start_seq;
     967             :         /* Dubious DSACK: DSACKed range greater than maximum advertised rwnd */
     968           0 :         if (seq_len > tp->max_window)
     969             :                 return 0;
     970           0 :         if (seq_len > tp->mss_cache)
     971           0 :                 dup_segs = DIV_ROUND_UP(seq_len, tp->mss_cache);
     972             : 
     973           0 :         tp->dsack_dups += dup_segs;
     974             :         /* Skip the DSACK if dup segs weren't retransmitted by sender */
     975           0 :         if (tp->dsack_dups > tp->total_retrans)
     976             :                 return 0;
     977             : 
     978           0 :         tp->rx_opt.sack_ok |= TCP_DSACK_SEEN;
     979           0 :         tp->rack.dsack_seen = 1;
     980             : 
     981           0 :         state->flag |= FLAG_DSACKING_ACK;
     982             :         /* A spurious retransmission is delivered */
     983           0 :         state->sack_delivered += dup_segs;
     984             : 
     985           0 :         return dup_segs;
     986             : }
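
To make the heuristics above concrete, the sketch below re-implements them in user space: a DSACK range is dismissed when it is wider than anything the receiver ever advertised, its length is converted into a duplicate-segment count with a round-up division, and the running count is compared against the number of segments actually retransmitted. The struct and names are invented for the example, and wrap-safe sequence comparison is simplified to plain subtraction.

#include <stdint.h>
#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

struct dsack_demo {
        uint32_t max_window;    /* largest receive window ever advertised */
        uint32_t mss;           /* sender's cached MSS */
        uint32_t dsack_dups;    /* DSACKed segments seen so far */
        uint32_t total_retrans; /* segments we actually retransmitted */
};

/* Returns the estimated number of duplicate segments, or 0 if dubious. */
static uint32_t dsack_demo_seen(struct dsack_demo *d, uint32_t start_seq,
                                uint32_t end_seq)
{
        uint32_t seq_len = end_seq - start_seq;
        uint32_t dup_segs = 1;

        if (!seq_len || seq_len > d->max_window)
                return 0;       /* empty, or wider than anything we advertised */
        if (seq_len > d->mss)
                dup_segs = DIV_ROUND_UP(seq_len, d->mss);

        d->dsack_dups += dup_segs;
        if (d->dsack_dups > d->total_retrans)
                return 0;       /* more DSACKs than retransmits: not our doing */
        return dup_segs;
}

int main(void)
{
        struct dsack_demo d = { 65535, 1448, 0, 4 };

        /* a 3000-byte DSACK range covers ceil(3000 / 1448) = 3 segments */
        printf("dup_segs = %u\n", dsack_demo_seen(&d, 1000, 4000));
        return 0;
}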
     987             : 
      988             : /* It's reordering when a higher sequence was delivered (i.e. SACKed) before
     989             :  * some lower never-retransmitted sequence ("low_seq"). The maximum reordering
     990             :  * distance is approximated in full-mss packet distance ("reordering").
     991             :  */
     992           0 : static void tcp_check_sack_reordering(struct sock *sk, const u32 low_seq,
     993             :                                       const int ts)
     994             : {
     995           0 :         struct tcp_sock *tp = tcp_sk(sk);
     996           0 :         const u32 mss = tp->mss_cache;
     997           0 :         u32 fack, metric;
     998             : 
     999           0 :         fack = tcp_highest_sack_seq(tp);
    1000           0 :         if (!before(low_seq, fack))
    1001             :                 return;
    1002             : 
    1003           0 :         metric = fack - low_seq;
    1004           0 :         if ((metric > tp->reordering * mss) && mss) {
    1005             : #if FASTRETRANS_DEBUG > 1
    1006             :                 pr_debug("Disorder%d %d %u f%u s%u rr%d\n",
    1007             :                          tp->rx_opt.sack_ok, inet_csk(sk)->icsk_ca_state,
    1008             :                          tp->reordering,
    1009             :                          0,
    1010             :                          tp->sacked_out,
    1011             :                          tp->undo_marker ? tp->undo_retrans : 0);
    1012             : #endif
    1013           0 :                 tp->reordering = min_t(u32, (metric + mss - 1) / mss,
    1014             :                                        sock_net(sk)->ipv4.sysctl_tcp_max_reordering);
    1015             :         }
    1016             : 
     1017             :         /* This exciting event is worth remembering. 8) */
    1018           0 :         tp->reord_seen++;
    1019           0 :         NET_INC_STATS(sock_net(sk),
    1020             :                       ts ? LINUX_MIB_TCPTSREORDER : LINUX_MIB_TCPSACKREORDER);
    1021             : }
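
The conversion above from a byte distance to a packet-based reordering degree is easy to miss, so the sketch below spells it out in user space: the distance between the highest SACKed sequence and a lower, never-retransmitted sequence is rounded up to full-MSS packets and capped. The cap is an assumption standing in for sysctl_tcp_max_reordering, and wrap-safe before() is replaced by a plain comparison for brevity.

#include <stdint.h>
#include <stdio.h>

#define DEMO_MAX_REORDERING 300U        /* assumed cap for the demo */

static uint32_t reordering_degree(uint32_t fack, uint32_t low_seq,
                                  uint32_t mss, uint32_t cur_reordering)
{
        uint32_t metric;

        if (!mss || fack <= low_seq)    /* wrap-safe before() omitted here */
                return cur_reordering;

        metric = fack - low_seq;
        if (metric <= cur_reordering * mss)
                return cur_reordering;  /* not worse than what we already track */

        metric = (metric + mss - 1) / mss;      /* bytes -> full-MSS packets */
        return metric < DEMO_MAX_REORDERING ? metric : DEMO_MAX_REORDERING;
}

int main(void)
{
        /* a never-retransmitted hole 10 segments (14480 bytes) below the highest SACK */
        printf("reordering = %u\n",
               reordering_degree(100000, 100000 - 14480, 1448, 3));
        return 0;
}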
    1022             : 
    1023             :  /* This must be called before lost_out or retrans_out are updated
    1024             :   * on a new loss, because we want to know if all skbs previously
    1025             :   * known to be lost have already been retransmitted, indicating
    1026             :   * that this newly lost skb is our next skb to retransmit.
    1027             :   */
    1028           0 : static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb)
    1029             : {
    1030           0 :         if ((!tp->retransmit_skb_hint && tp->retrans_out >= tp->lost_out) ||
    1031           0 :             (tp->retransmit_skb_hint &&
    1032           0 :              before(TCP_SKB_CB(skb)->seq,
    1033             :                     TCP_SKB_CB(tp->retransmit_skb_hint)->seq)))
    1034           0 :                 tp->retransmit_skb_hint = skb;
    1035           0 : }
    1036             : 
    1037             : /* Sum the number of packets on the wire we have marked as lost, and
    1038             :  * notify the congestion control module that the given skb was marked lost.
    1039             :  */
    1040           0 : static void tcp_notify_skb_loss_event(struct tcp_sock *tp, const struct sk_buff *skb)
    1041             : {
    1042           0 :         tp->lost += tcp_skb_pcount(skb);
    1043           0 : }
    1044             : 
    1045           0 : void tcp_mark_skb_lost(struct sock *sk, struct sk_buff *skb)
    1046             : {
    1047           0 :         __u8 sacked = TCP_SKB_CB(skb)->sacked;
    1048           0 :         struct tcp_sock *tp = tcp_sk(sk);
    1049             : 
    1050           0 :         if (sacked & TCPCB_SACKED_ACKED)
    1051             :                 return;
    1052             : 
    1053           0 :         tcp_verify_retransmit_hint(tp, skb);
    1054           0 :         if (sacked & TCPCB_LOST) {
    1055           0 :                 if (sacked & TCPCB_SACKED_RETRANS) {
    1056             :                         /* Account for retransmits that are lost again */
    1057           0 :                         TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
    1058           0 :                         tp->retrans_out -= tcp_skb_pcount(skb);
    1059           0 :                         NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPLOSTRETRANSMIT,
    1060             :                                       tcp_skb_pcount(skb));
    1061           0 :                         tcp_notify_skb_loss_event(tp, skb);
    1062             :                 }
    1063             :         } else {
    1064           0 :                 tp->lost_out += tcp_skb_pcount(skb);
    1065           0 :                 TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
    1066           0 :                 tcp_notify_skb_loss_event(tp, skb);
    1067             :         }
    1068             : }
    1069             : 
    1070             : /* Updates the delivered and delivered_ce counts */
    1071         351 : static void tcp_count_delivered(struct tcp_sock *tp, u32 delivered,
    1072             :                                 bool ece_ack)
    1073             : {
    1074         351 :         tp->delivered += delivered;
    1075         351 :         if (ece_ack)
    1076           0 :                 tp->delivered_ce += delivered;
    1077             : }
    1078             : 
    1079             : /* This procedure tags the retransmission queue when SACKs arrive.
    1080             :  *
    1081             :  * We have three tag bits: SACKED(S), RETRANS(R) and LOST(L).
    1082             :  * Packets in queue with these bits set are counted in variables
    1083             :  * sacked_out, retrans_out and lost_out, correspondingly.
    1084             :  *
    1085             :  * Valid combinations are:
    1086             :  * Tag  InFlight        Description
    1087             :  * 0    1               - orig segment is in flight.
    1088             :  * S    0               - nothing flies, orig reached receiver.
    1089             :  * L    0               - nothing flies, orig lost by net.
    1090             :  * R    2               - both orig and retransmit are in flight.
    1091             :  * L|R  1               - orig is lost, retransmit is in flight.
    1092             :  * S|R  1               - orig reached receiver, retrans is still in flight.
    1093             :  * (L|S|R is logically valid, it could occur when L|R is sacked,
     1094             :  *  but it is equivalent to plain S and the code short-circuits it to S.
    1095             :  *  L|S is logically invalid, it would mean -1 packet in flight 8))
    1096             :  *
     1097             :  * These 6 states form a finite state machine, controlled by the following events:
    1098             :  * 1. New ACK (+SACK) arrives. (tcp_sacktag_write_queue())
    1099             :  * 2. Retransmission. (tcp_retransmit_skb(), tcp_xmit_retransmit_queue())
    1100             :  * 3. Loss detection event of two flavors:
    1101             :  *      A. Scoreboard estimator decided the packet is lost.
    1102             :  *         A'. Reno "three dupacks" marks head of queue lost.
    1103             :  *      B. SACK arrives sacking SND.NXT at the moment, when the
    1104             :  *         segment was retransmitted.
    1105             :  * 4. D-SACK added new rule: D-SACK changes any tag to S.
    1106             :  *
     1107             :  * It is pleasant to note that the state diagram turns out to be commutative,
     1108             :  * so that we are allowed not to be bothered by the order of our actions
     1109             :  * when multiple events arrive simultaneously (see the function below).
    1110             :  *
    1111             :  * Reordering detection.
    1112             :  * --------------------
     1113             :  * The reordering metric is the maximal distance by which a packet can be
     1114             :  * displaced in the packet stream. With SACKs we can estimate it:
    1115             :  *
     1116             :  * 1. A SACK fills an old hole and the corresponding segment was not
     1117             :  *    ever retransmitted -> reordering. Alas, we cannot use it
     1118             :  *    when the segment was retransmitted.
     1119             :  * 2. The last flaw is solved with D-SACK. A D-SACK arrives
     1120             :  *    for a retransmitted and already SACKed segment -> reordering.
    1121             :  * Both of these heuristics are not used in Loss state, when we cannot
    1122             :  * account for retransmits accurately.
    1123             :  *
    1124             :  * SACK block validation.
    1125             :  * ----------------------
    1126             :  *
     1127             :  * SACK block range validation checks that the received SACK block fits within
     1128             :  * the expected sequence limits, i.e., it is between SND.UNA and SND.NXT.
     1129             :  * Note that SND.UNA is not included in the range even though it would be valid,
     1130             :  * because it means that the receiver is rather inconsistent with itself,
     1131             :  * reporting SACK reneging when it should advance SND.UNA. Such a SACK block is
     1132             :  * perfectly valid, however, in light of RFC 2018, which explicitly states
     1133             :  * that "SACK block MUST reflect the newest segment.  Even if the newest
     1134             :  * segment is going to be discarded ...", not that it looks very clever
     1135             :  * in the case of the head skb. Due to potential receiver-driven attacks, we
     1136             :  * choose to avoid immediate execution of a walk of the write queue due to
     1137             :  * reneging and defer the head skb's loss recovery to the standard loss recovery
     1138             :  * procedure that will eventually trigger (nothing forbids us from doing this).
    1139             :  *
     1140             :  * This also blocks SACK blocks affected by start_seq wrap-around. The problem
     1141             :  * lies in the fact that, though start_seq (s) is before end_seq (i.e., not
     1142             :  * reversed), there is no guarantee that it will be before snd_nxt (n). The
     1143             :  * problem happens when start_seq resides between the end_seq wrap (e_w) and
     1144             :  * the snd_nxt wrap (s_w):
    1145             :  *
    1146             :  *         <- outs wnd ->                          <- wrapzone ->
    1147             :  *         u     e      n                         u_w   e_w  s n_w
    1148             :  *         |     |      |                          |     |   |  |
    1149             :  * |<------------+------+----- TCP seqno space --------------+---------->|
    1150             :  * ...-- <2^31 ->|                                           |<--------...
    1151             :  * ...---- >2^31 ------>|                                    |<--------...
    1152             :  *
     1153             :  * The current code wouldn't be vulnerable, but it's still better to discard
     1154             :  * such crazy SACK blocks. Doing this check for start_seq alone closes the
     1155             :  * somewhat similar case (end_seq after the snd_nxt wrap) as the earlier
     1156             :  * reversed check did; the snd_nxt wrap -> snd_una region will then become
     1157             :  * "well defined", i.e., equal to the ideal case (infinite seqno space without wrap-caused issues).
    1158             :  *
    1159             :  * With D-SACK the lower bound is extended to cover sequence space below
    1160             :  * SND.UNA down to undo_marker, which is the last point of interest. Yet
     1161             :  * again, the D-SACK block must not go across snd_una (for the same reason as
     1162             :  * for the normal SACK blocks, explained above). But there all simplicity
     1163             :  * ends: TCP might receive valid D-SACKs below that. As long as they reside
     1164             :  * fully below undo_marker they do not affect behavior in any way and can
     1165             :  * therefore be safely ignored. In rare cases (which are more or less
     1166             :  * theoretical ones), the D-SACK will nicely cross that boundary due to skb
     1167             :  * fragmentation and packet reordering past the skb's retransmission. To consider
     1168             :  * them correctly, the acceptable range must be extended even more, though
    1169             :  * the exact amount is rather hard to quantify. However, tp->max_window can
    1170             :  * be used as an exaggerated estimate.
    1171             :  */
    1172           0 : static bool tcp_is_sackblock_valid(struct tcp_sock *tp, bool is_dsack,
    1173             :                                    u32 start_seq, u32 end_seq)
    1174             : {
    1175             :         /* Too far in future, or reversed (interpretation is ambiguous) */
    1176           0 :         if (after(end_seq, tp->snd_nxt) || !before(start_seq, end_seq))
    1177             :                 return false;
    1178             : 
    1179             :         /* Nasty start_seq wrap-around check (see comments above) */
    1180           0 :         if (!before(start_seq, tp->snd_nxt))
    1181             :                 return false;
    1182             : 
    1183             :         /* In outstanding window? ...This is valid exit for D-SACKs too.
    1184             :          * start_seq == snd_una is non-sensical (see comments above)
    1185             :          */
    1186           0 :         if (after(start_seq, tp->snd_una))
    1187             :                 return true;
    1188             : 
    1189           0 :         if (!is_dsack || !tp->undo_marker)
    1190             :                 return false;
    1191             : 
    1192             :         /* ...Then it's D-SACK, and must reside below snd_una completely */
    1193           0 :         if (after(end_seq, tp->snd_una))
    1194             :                 return false;
    1195             : 
    1196           0 :         if (!before(start_seq, tp->undo_marker))
    1197             :                 return true;
    1198             : 
    1199             :         /* Too old */
    1200           0 :         if (!after(end_seq, tp->undo_marker))
    1201             :                 return false;
    1202             : 
    1203             :         /* Undo_marker boundary crossing (overestimates a lot). Known already:
    1204             :          *   start_seq < undo_marker and end_seq >= undo_marker.
    1205             :          */
    1206           0 :         return !before(start_seq, end_seq - tp->max_window);
    1207             : }
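
The validation above leans entirely on wrap-safe sequence comparison. In the kernel these are the before()/after() helpers, which compare 32-bit sequence numbers through a signed difference so that blocks straddling the 2^32 wrap still order correctly, as long as the two values are less than 2^31 apart. A self-contained user-space sketch of the same idea (the helper names here are invented):

#include <stdint.h>
#include <stdio.h>

static int seq_before(uint32_t seq1, uint32_t seq2)
{
        return (int32_t)(seq1 - seq2) < 0;      /* signed view of the difference */
}

static int seq_after(uint32_t seq1, uint32_t seq2)
{
        return seq_before(seq2, seq1);
}

int main(void)
{
        uint32_t snd_una = 0xfffffff0u;         /* just below the wrap */
        uint32_t end_seq = 0x00000100u;         /* just after the wrap */

        /* 0x100 is logically "after" 0xfffffff0 despite being numerically smaller */
        printf("after=%d before=%d\n",
               seq_after(end_seq, snd_una), seq_before(snd_una, end_seq));
        return 0;
}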
    1208             : 
    1209           0 : static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb,
    1210             :                             struct tcp_sack_block_wire *sp, int num_sacks,
    1211             :                             u32 prior_snd_una, struct tcp_sacktag_state *state)
    1212             : {
    1213           0 :         struct tcp_sock *tp = tcp_sk(sk);
    1214           0 :         u32 start_seq_0 = get_unaligned_be32(&sp[0].start_seq);
    1215           0 :         u32 end_seq_0 = get_unaligned_be32(&sp[0].end_seq);
    1216           0 :         u32 dup_segs;
    1217             : 
    1218           0 :         if (before(start_seq_0, TCP_SKB_CB(ack_skb)->ack_seq)) {
    1219           0 :                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKRECV);
    1220           0 :         } else if (num_sacks > 1) {
    1221           0 :                 u32 end_seq_1 = get_unaligned_be32(&sp[1].end_seq);
    1222           0 :                 u32 start_seq_1 = get_unaligned_be32(&sp[1].start_seq);
    1223             : 
    1224           0 :                 if (after(end_seq_0, end_seq_1) || before(start_seq_0, start_seq_1))
    1225             :                         return false;
    1226           0 :                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKOFORECV);
    1227             :         } else {
    1228             :                 return false;
    1229             :         }
    1230             : 
    1231           0 :         dup_segs = tcp_dsack_seen(tp, start_seq_0, end_seq_0, state);
    1232           0 :         if (!dup_segs) {        /* Skip dubious DSACK */
    1233           0 :                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKIGNOREDDUBIOUS);
    1234           0 :                 return false;
    1235             :         }
    1236             : 
    1237           0 :         NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPDSACKRECVSEGS, dup_segs);
    1238             : 
    1239             :         /* D-SACK for already forgotten data... Do dumb counting. */
    1240           0 :         if (tp->undo_marker && tp->undo_retrans > 0 &&
    1241           0 :             !after(end_seq_0, prior_snd_una) &&
    1242           0 :             after(end_seq_0, tp->undo_marker))
    1243           0 :                 tp->undo_retrans = max_t(int, 0, tp->undo_retrans - dup_segs);
    1244             : 
    1245             :         return true;
    1246             : }
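
tcp_check_dsack() above recognizes a D-SACK in the two ways RFC 2883 describes: the first SACK block lies below the cumulative ACK, or it is completely contained in the second block. A compact user-space sketch of that classification, with plain comparisons standing in for the wrap-safe before()/after() and invented names:

#include <stdint.h>
#include <stdio.h>

struct sack_block { uint32_t start_seq, end_seq; };

static int is_dsack(const struct sack_block *sp, int num_sacks, uint32_t ack_seq)
{
        if (sp[0].start_seq < ack_seq)
                return 1;                       /* below what was cumulatively ACKed */
        if (num_sacks > 1 &&
            sp[0].start_seq >= sp[1].start_seq &&
            sp[0].end_seq <= sp[1].end_seq)
                return 1;                       /* first block nested in the second */
        return 0;
}

int main(void)
{
        struct sack_block sp[2] = { { 1000, 2448 }, { 1000, 5000 } };

        /* first block sits inside the second -> duplicate data was received */
        printf("dsack=%d\n", is_dsack(sp, 2, 500));
        return 0;
}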
    1247             : 
     1248             : /* Check if the skb is fully within the SACK block. In the presence of GSO skbs,
     1249             :  * the incoming SACK may not match exactly, but we can find a smaller MSS-
     1250             :  * aligned portion of it that matches. Therefore we might need to fragment,
     1251             :  * which may fail and create some hassle (the caller must handle error case
     1252             :  * returns).
    1253             :  *
    1254             :  * FIXME: this could be merged to shift decision code
    1255             :  */
    1256           0 : static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb,
    1257             :                                   u32 start_seq, u32 end_seq)
    1258             : {
    1259           0 :         int err;
    1260           0 :         bool in_sack;
    1261           0 :         unsigned int pkt_len;
    1262           0 :         unsigned int mss;
    1263             : 
    1264           0 :         in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) &&
    1265           0 :                   !before(end_seq, TCP_SKB_CB(skb)->end_seq);
    1266             : 
    1267           0 :         if (tcp_skb_pcount(skb) > 1 && !in_sack &&
    1268           0 :             after(TCP_SKB_CB(skb)->end_seq, start_seq)) {
    1269           0 :                 mss = tcp_skb_mss(skb);
    1270           0 :                 in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq);
    1271             : 
    1272           0 :                 if (!in_sack) {
    1273           0 :                         pkt_len = start_seq - TCP_SKB_CB(skb)->seq;
    1274           0 :                         if (pkt_len < mss)
    1275             :                                 pkt_len = mss;
    1276             :                 } else {
    1277           0 :                         pkt_len = end_seq - TCP_SKB_CB(skb)->seq;
    1278           0 :                         if (pkt_len < mss)
    1279             :                                 return -EINVAL;
    1280             :                 }
    1281             : 
    1282             :                 /* Round if necessary so that SACKs cover only full MSSes
    1283             :                  * and/or the remaining small portion (if present)
    1284             :                  */
    1285           0 :                 if (pkt_len > mss) {
    1286           0 :                         unsigned int new_len = (pkt_len / mss) * mss;
    1287           0 :                         if (!in_sack && new_len < pkt_len)
    1288           0 :                                 new_len += mss;
    1289             :                         pkt_len = new_len;
    1290             :                 }
    1291             : 
    1292           0 :                 if (pkt_len >= skb->len && !in_sack)
    1293             :                         return 0;
    1294             : 
    1295           0 :                 err = tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb,
    1296             :                                    pkt_len, mss, GFP_ATOMIC);
    1297           0 :                 if (err < 0)
    1298             :                         return err;
    1299             :         }
    1300             : 
    1301           0 :         return in_sack;
    1302             : }
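
The rounding step above is what keeps SACK-driven fragmentation MSS-aligned: the split length is rounded down to a whole number of MSS-sized segments, or up when the SACK block starts inside the skb (!in_sack), so only full segments plus any small tail get tagged. The sketch below isolates just that rounding; the kernel's minimum-length and error handling is omitted, and the helper name is invented.

#include <stdint.h>
#include <stdio.h>

/* Round a split point to full MSS multiples, up or down (sketch only). */
static uint32_t round_to_mss(uint32_t pkt_len, uint32_t mss, int round_up)
{
        uint32_t new_len = (pkt_len / mss) * mss;       /* round down */

        if (round_up && new_len < pkt_len)
                new_len += mss;                 /* bump to the next MSS boundary */
        return new_len;
}

int main(void)
{
        /* SACK starts 3000 bytes into a GSO skb with MSS 1448: split at 4344 */
        printf("round up:   %u\n", round_to_mss(3000, 1448, 1));
        /* SACK ends 3000 bytes into the skb: split at 2896 */
        printf("round down: %u\n", round_to_mss(3000, 1448, 0));
        return 0;
}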
    1303             : 
    1304             : /* Mark the given newly-SACKed range as such, adjusting counters and hints. */
    1305           0 : static u8 tcp_sacktag_one(struct sock *sk,
    1306             :                           struct tcp_sacktag_state *state, u8 sacked,
    1307             :                           u32 start_seq, u32 end_seq,
    1308             :                           int dup_sack, int pcount,
    1309             :                           u64 xmit_time)
    1310             : {
    1311           0 :         struct tcp_sock *tp = tcp_sk(sk);
    1312             : 
    1313             :         /* Account D-SACK for retransmitted packet. */
    1314           0 :         if (dup_sack && (sacked & TCPCB_RETRANS)) {
    1315           0 :                 if (tp->undo_marker && tp->undo_retrans > 0 &&
    1316           0 :                     after(end_seq, tp->undo_marker))
    1317           0 :                         tp->undo_retrans--;
    1318           0 :                 if ((sacked & TCPCB_SACKED_ACKED) &&
    1319           0 :                     before(start_seq, state->reord))
    1320           0 :                                 state->reord = start_seq;
    1321             :         }
    1322             : 
    1323             :         /* Nothing to do; acked frame is about to be dropped (was ACKed). */
    1324           0 :         if (!after(end_seq, tp->snd_una))
    1325             :                 return sacked;
    1326             : 
    1327           0 :         if (!(sacked & TCPCB_SACKED_ACKED)) {
    1328           0 :                 tcp_rack_advance(tp, sacked, end_seq, xmit_time);
    1329             : 
    1330           0 :                 if (sacked & TCPCB_SACKED_RETRANS) {
    1331             :                         /* If the segment is not tagged as lost,
    1332             :                          * we do not clear RETRANS, believing
    1333             :                          * that retransmission is still in flight.
    1334             :                          */
    1335           0 :                         if (sacked & TCPCB_LOST) {
    1336           0 :                                 sacked &= ~(TCPCB_LOST|TCPCB_SACKED_RETRANS);
    1337           0 :                                 tp->lost_out -= pcount;
    1338           0 :                                 tp->retrans_out -= pcount;
    1339             :                         }
    1340             :                 } else {
    1341           0 :                         if (!(sacked & TCPCB_RETRANS)) {
     1342             :                                 /* New SACK for a frame that was never
     1343             :                                  * retransmitted and was in a hole. It is reordering.
     1344             :                                  */
    1345           0 :                                 if (before(start_seq,
    1346           0 :                                            tcp_highest_sack_seq(tp)) &&
    1347           0 :                                     before(start_seq, state->reord))
    1348           0 :                                         state->reord = start_seq;
    1349             : 
    1350           0 :                                 if (!after(end_seq, tp->high_seq))
    1351           0 :                                         state->flag |= FLAG_ORIG_SACK_ACKED;
    1352           0 :                                 if (state->first_sackt == 0)
    1353           0 :                                         state->first_sackt = xmit_time;
    1354           0 :                                 state->last_sackt = xmit_time;
    1355             :                         }
    1356             : 
    1357           0 :                         if (sacked & TCPCB_LOST) {
    1358           0 :                                 sacked &= ~TCPCB_LOST;
    1359           0 :                                 tp->lost_out -= pcount;
    1360             :                         }
    1361             :                 }
    1362             : 
    1363           0 :                 sacked |= TCPCB_SACKED_ACKED;
    1364           0 :                 state->flag |= FLAG_DATA_SACKED;
    1365           0 :                 tp->sacked_out += pcount;
    1366             :                 /* Out-of-order packets delivered */
    1367           0 :                 state->sack_delivered += pcount;
    1368             : 
    1369             :                 /* Lost marker hint past SACKed? Tweak RFC3517 cnt */
    1370           0 :                 if (tp->lost_skb_hint &&
    1371           0 :                     before(start_seq, TCP_SKB_CB(tp->lost_skb_hint)->seq))
    1372           0 :                         tp->lost_cnt_hint += pcount;
    1373             :         }
    1374             : 
    1375             :         /* D-SACK. We can detect redundant retransmission in S|R and plain R
    1376             :          * frames and clear it. undo_retrans is decreased above, L|R frames
    1377             :          * are accounted above as well.
    1378             :          */
    1379           0 :         if (dup_sack && (sacked & TCPCB_SACKED_RETRANS)) {
    1380           0 :                 sacked &= ~TCPCB_SACKED_RETRANS;
    1381           0 :                 tp->retrans_out -= pcount;
    1382             :         }
    1383             : 
    1384             :         return sacked;
    1385             : }
    1386             : 
    1387             : /* Shift newly-SACKed bytes from this skb to the immediately previous
    1388             :  * already-SACKed sk_buff. Mark the newly-SACKed bytes as such.
    1389             :  */
    1390           0 : static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev,
    1391             :                             struct sk_buff *skb,
    1392             :                             struct tcp_sacktag_state *state,
    1393             :                             unsigned int pcount, int shifted, int mss,
    1394             :                             bool dup_sack)
    1395             : {
    1396           0 :         struct tcp_sock *tp = tcp_sk(sk);
    1397           0 :         u32 start_seq = TCP_SKB_CB(skb)->seq;        /* start of newly-SACKed */
    1398           0 :         u32 end_seq = start_seq + shifted;      /* end of newly-SACKed */
    1399             : 
    1400           0 :         BUG_ON(!pcount);
    1401             : 
    1402             :         /* Adjust counters and hints for the newly sacked sequence
    1403             :          * range but discard the return value since prev is already
    1404             :          * marked. We must tag the range first because the seq
    1405             :          * advancement below implicitly advances
    1406             :          * tcp_highest_sack_seq() when skb is highest_sack.
    1407             :          */
    1408           0 :         tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked,
    1409             :                         start_seq, end_seq, dup_sack, pcount,
    1410             :                         tcp_skb_timestamp_us(skb));
    1411           0 :         tcp_rate_skb_delivered(sk, skb, state->rate);
    1412             : 
    1413           0 :         if (skb == tp->lost_skb_hint)
    1414           0 :                 tp->lost_cnt_hint += pcount;
    1415             : 
    1416           0 :         TCP_SKB_CB(prev)->end_seq += shifted;
    1417           0 :         TCP_SKB_CB(skb)->seq += shifted;
    1418             : 
    1419           0 :         tcp_skb_pcount_add(prev, pcount);
    1420           0 :         WARN_ON_ONCE(tcp_skb_pcount(skb) < pcount);
    1421           0 :         tcp_skb_pcount_add(skb, -pcount);
    1422             : 
     1423             :         /* When we're adding to gso_segs == 1, gso_size will be zero;
     1424             :          * in theory this shouldn't be necessary, but as long as DSACK
     1425             :          * code can come after this skb later on, it's better to keep
     1426             :          * setting gso_size to something.
    1427             :          */
    1428           0 :         if (!TCP_SKB_CB(prev)->tcp_gso_size)
    1429           0 :                 TCP_SKB_CB(prev)->tcp_gso_size = mss;
    1430             : 
    1431             :         /* CHECKME: To clear or not to clear? Mimics normal skb currently */
    1432           0 :         if (tcp_skb_pcount(skb) <= 1)
    1433           0 :                 TCP_SKB_CB(skb)->tcp_gso_size = 0;
    1434             : 
    1435             :         /* Difference in this won't matter, both ACKed by the same cumul. ACK */
    1436           0 :         TCP_SKB_CB(prev)->sacked |= (TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS);
    1437             : 
    1438           0 :         if (skb->len > 0) {
    1439           0 :                 BUG_ON(!tcp_skb_pcount(skb));
    1440           0 :                 NET_INC_STATS(sock_net(sk), LINUX_MIB_SACKSHIFTED);
    1441           0 :                 return false;
    1442             :         }
    1443             : 
    1444             :         /* Whole SKB was eaten :-) */
    1445             : 
    1446           0 :         if (skb == tp->retransmit_skb_hint)
    1447           0 :                 tp->retransmit_skb_hint = prev;
    1448           0 :         if (skb == tp->lost_skb_hint) {
    1449           0 :                 tp->lost_skb_hint = prev;
    1450           0 :                 tp->lost_cnt_hint -= tcp_skb_pcount(prev);
    1451             :         }
    1452             : 
    1453           0 :         TCP_SKB_CB(prev)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
    1454           0 :         TCP_SKB_CB(prev)->eor = TCP_SKB_CB(skb)->eor;
    1455           0 :         if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
    1456           0 :                 TCP_SKB_CB(prev)->end_seq++;
    1457             : 
    1458           0 :         if (skb == tcp_highest_sack(sk))
    1459           0 :                 tcp_advance_highest_sack(sk, skb);
    1460             : 
    1461           0 :         tcp_skb_collapse_tstamp(prev, skb);
    1462           0 :         if (unlikely(TCP_SKB_CB(prev)->tx.delivered_mstamp))
    1463           0 :                 TCP_SKB_CB(prev)->tx.delivered_mstamp = 0;
    1464             : 
    1465           0 :         tcp_rtx_queue_unlink_and_free(skb, sk);
    1466             : 
    1467           0 :         NET_INC_STATS(sock_net(sk), LINUX_MIB_SACKMERGED);
    1468             : 
    1469           0 :         return true;
    1470             : }
    1471             : 
     1472             : /* I wish gso_size had a somewhat saner initialization than
     1473             :  * something-or-zero, which complicates things.
    1474             :  */
    1475           0 : static int tcp_skb_seglen(const struct sk_buff *skb)
    1476             : {
    1477           0 :         return tcp_skb_pcount(skb) == 1 ? skb->len : tcp_skb_mss(skb);
    1478             : }
    1479             : 
    1480             : /* Shifting pages past head area doesn't work */
    1481           0 : static int skb_can_shift(const struct sk_buff *skb)
    1482             : {
    1483           0 :         return !skb_headlen(skb) && skb_is_nonlinear(skb);
    1484             : }
    1485             : 
    1486           0 : int tcp_skb_shift(struct sk_buff *to, struct sk_buff *from,
    1487             :                   int pcount, int shiftlen)
    1488             : {
    1489             :         /* TCP min gso_size is 8 bytes (TCP_MIN_GSO_SIZE)
    1490             :          * Since TCP_SKB_CB(skb)->tcp_gso_segs is 16 bits, we need
     1491             :          * to make sure we are not storing more than 65535 * 8 bytes per skb,
    1492             :          * even if current MSS is bigger.
    1493             :          */
    1494           0 :         if (unlikely(to->len + shiftlen >= 65535 * TCP_MIN_GSO_SIZE))
    1495             :                 return 0;
    1496           0 :         if (unlikely(tcp_skb_pcount(to) + pcount > 65535))
    1497             :                 return 0;
    1498           0 :         return skb_shift(to, from, shiftlen);
    1499             : }
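
The two early returns above exist purely to keep the merged skb representable: as the comment notes, tcp_gso_segs is a 16-bit field and the minimum GSO segment size is 8 bytes, so both the byte length and the segment count after a shift must stay within those limits. A user-space sketch of the same guards, with invented names and sample values:

#include <stdint.h>
#include <stdio.h>

#define DEMO_MIN_GSO_SIZE 8U    /* cf. TCP_MIN_GSO_SIZE in the comment above */

static int can_merge(uint32_t to_len, uint32_t to_pcount,
                     uint32_t shiftlen, uint32_t pcount)
{
        if (to_len + shiftlen >= 65535U * DEMO_MIN_GSO_SIZE)
                return 0;       /* merged byte length exceeds what 16-bit gso_segs can describe */
        if (to_pcount + pcount > 65535U)
                return 0;       /* merged segment count would not fit in 16 bits */
        return 1;
}

int main(void)
{
        printf("%d\n", can_merge(60000, 40, 14480, 10));        /* fits: 1 */
        printf("%d\n", can_merge(500000, 65000, 60000, 1000));  /* too big: 0 */
        return 0;
}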
    1500             : 
    1501             : /* Try collapsing SACK blocks spanning across multiple skbs to a single
    1502             :  * skb.
    1503             :  */
    1504           0 : static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
    1505             :                                           struct tcp_sacktag_state *state,
    1506             :                                           u32 start_seq, u32 end_seq,
    1507             :                                           bool dup_sack)
    1508             : {
    1509           0 :         struct tcp_sock *tp = tcp_sk(sk);
    1510           0 :         struct sk_buff *prev;
    1511           0 :         int mss;
    1512           0 :         int pcount = 0;
    1513           0 :         int len;
    1514           0 :         int in_sack;
    1515             : 
    1516             :         /* Normally R but no L won't result in plain S */
    1517           0 :         if (!dup_sack &&
    1518           0 :             (TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_RETRANS)) == TCPCB_SACKED_RETRANS)
    1519           0 :                 goto fallback;
    1520           0 :         if (!skb_can_shift(skb))
    1521           0 :                 goto fallback;
    1522             :         /* This frame is about to be dropped (was ACKed). */
    1523           0 :         if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
    1524           0 :                 goto fallback;
    1525             : 
    1526             :         /* Can only happen with delayed DSACK + discard craziness */
    1527           0 :         prev = skb_rb_prev(skb);
    1528           0 :         if (!prev)
    1529           0 :                 goto fallback;
    1530             : 
    1531           0 :         if ((TCP_SKB_CB(prev)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED)
    1532           0 :                 goto fallback;
    1533             : 
    1534           0 :         if (!tcp_skb_can_collapse(prev, skb))
    1535           0 :                 goto fallback;
    1536             : 
    1537           0 :         in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) &&
    1538           0 :                   !before(end_seq, TCP_SKB_CB(skb)->end_seq);
    1539             : 
    1540           0 :         if (in_sack) {
    1541           0 :                 len = skb->len;
    1542           0 :                 pcount = tcp_skb_pcount(skb);
    1543           0 :                 mss = tcp_skb_seglen(skb);
    1544             : 
    1545             :                 /* TODO: Fix DSACKs to not fragment already SACKed and we can
    1546             :                  * drop this restriction as unnecessary
    1547             :                  */
    1548           0 :                 if (mss != tcp_skb_seglen(prev))
    1549           0 :                         goto fallback;
    1550             :         } else {
    1551           0 :                 if (!after(TCP_SKB_CB(skb)->end_seq, start_seq))
    1552           0 :                         goto noop;
     1553             :                 /* CHECKME: Is this the non-MSS split case only? This will
     1554             :                  * cause skipped skbs due to the advancing loop, btw; the
     1555             :                  * original has that feature too.
    1556             :                  */
    1557           0 :                 if (tcp_skb_pcount(skb) <= 1)
    1558           0 :                         goto noop;
    1559             : 
    1560           0 :                 in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq);
    1561           0 :                 if (!in_sack) {
    1562             :                         /* TODO: head merge to next could be attempted here
    1563             :                          * if (!after(TCP_SKB_CB(skb)->end_seq, end_seq)),
    1564             :                          * though it might not be worth of the additional hassle
    1565             :                          *
     1566             :                          * ...we can probably just fall back to what was done
     1567             :                          * previously. We could try merging non-SACKed ones
     1568             :                          * as well, but it probably isn't going to pay off
     1569             :                          * because later SACKs might split them again, and
     1570             :                          * it would make skb timestamp tracking a considerably
     1571             :                          * harder problem.
    1572             :                          */
    1573           0 :                         goto fallback;
    1574             :                 }
    1575             : 
    1576           0 :                 len = end_seq - TCP_SKB_CB(skb)->seq;
    1577           0 :                 BUG_ON(len < 0);
    1578           0 :                 BUG_ON(len > skb->len);
    1579             : 
     1580             :                 /* MSS boundaries should be honoured or else pcount will
     1581             :                  * severely break, even though it makes things a bit trickier.
     1582             :                  * Optimize the common case to avoid most of the divides.
    1583             :                  */
    1584           0 :                 mss = tcp_skb_mss(skb);
    1585             : 
    1586             :                 /* TODO: Fix DSACKs to not fragment already SACKed and we can
    1587             :                  * drop this restriction as unnecessary
    1588             :                  */
    1589           0 :                 if (mss != tcp_skb_seglen(prev))
    1590           0 :                         goto fallback;
    1591             : 
    1592           0 :                 if (len == mss) {
    1593             :                         pcount = 1;
    1594           0 :                 } else if (len < mss) {
    1595           0 :                         goto noop;
    1596             :                 } else {
    1597           0 :                         pcount = len / mss;
    1598           0 :                         len = pcount * mss;
    1599             :                 }
    1600             :         }
    1601             : 
    1602             :         /* tcp_sacktag_one() won't SACK-tag ranges below snd_una */
    1603           0 :         if (!after(TCP_SKB_CB(skb)->seq + len, tp->snd_una))
    1604           0 :                 goto fallback;
    1605             : 
    1606           0 :         if (!tcp_skb_shift(prev, skb, pcount, len))
    1607           0 :                 goto fallback;
    1608           0 :         if (!tcp_shifted_skb(sk, prev, skb, state, pcount, len, mss, dup_sack))
    1609           0 :                 goto out;
    1610             : 
     1611             :         /* A filled hole allows collapsing with the next skb as well; this is
     1612             :          * very useful when a hole-on-every-nth-skb pattern happens.
     1613             :          */
    1614           0 :         skb = skb_rb_next(prev);
    1615           0 :         if (!skb)
    1616           0 :                 goto out;
    1617             : 
    1618           0 :         if (!skb_can_shift(skb) ||
    1619           0 :             ((TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED) ||
    1620           0 :             (mss != tcp_skb_seglen(skb)))
    1621           0 :                 goto out;
    1622             : 
    1623           0 :         len = skb->len;
    1624           0 :         pcount = tcp_skb_pcount(skb);
    1625           0 :         if (tcp_skb_shift(prev, skb, pcount, len))
    1626           0 :                 tcp_shifted_skb(sk, prev, skb, state, pcount,
    1627             :                                 len, mss, 0);
    1628             : 
    1629           0 : out:
    1630             :         return prev;
    1631             : 
    1632             : noop:
    1633             :         return skb;
    1634             : 
    1635           0 : fallback:
    1636           0 :         NET_INC_STATS(sock_net(sk), LINUX_MIB_SACKSHIFTFALLBACK);
    1637           0 :         return NULL;
    1638             : }
    1639             : 
    1640           0 : static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
    1641             :                                         struct tcp_sack_block *next_dup,
    1642             :                                         struct tcp_sacktag_state *state,
    1643             :                                         u32 start_seq, u32 end_seq,
    1644             :                                         bool dup_sack_in)
    1645             : {
    1646           0 :         struct tcp_sock *tp = tcp_sk(sk);
    1647           0 :         struct sk_buff *tmp;
    1648             : 
    1649           0 :         skb_rbtree_walk_from(skb) {
    1650           0 :                 int in_sack = 0;
    1651           0 :                 bool dup_sack = dup_sack_in;
    1652             : 
    1653             :                 /* queue is in-order => we can short-circuit the walk early */
    1654           0 :                 if (!before(TCP_SKB_CB(skb)->seq, end_seq))
    1655             :                         break;
    1656             : 
    1657           0 :                 if (next_dup  &&
    1658           0 :                     before(TCP_SKB_CB(skb)->seq, next_dup->end_seq)) {
    1659           0 :                         in_sack = tcp_match_skb_to_sack(sk, skb,
    1660             :                                                         next_dup->start_seq,
    1661             :                                                         next_dup->end_seq);
    1662           0 :                         if (in_sack > 0)
    1663           0 :                                 dup_sack = true;
    1664             :                 }
    1665             : 
     1666             :                 /* The skb reference here is a bit tricky to get right, since
     1667             :                  * shifting can eat and free both this skb and the next,
     1668             :                  * so not even the _safe variant of the loop is enough.
     1669             :                  */
    1670           0 :                 if (in_sack <= 0) {
    1671           0 :                         tmp = tcp_shift_skb_data(sk, skb, state,
    1672             :                                                  start_seq, end_seq, dup_sack);
    1673           0 :                         if (tmp) {
    1674           0 :                                 if (tmp != skb) {
    1675           0 :                                         skb = tmp;
    1676           0 :                                         continue;
    1677             :                                 }
    1678             : 
    1679             :                                 in_sack = 0;
    1680             :                         } else {
    1681           0 :                                 in_sack = tcp_match_skb_to_sack(sk, skb,
    1682             :                                                                 start_seq,
    1683             :                                                                 end_seq);
    1684             :                         }
    1685             :                 }
    1686             : 
    1687           0 :                 if (unlikely(in_sack < 0))
    1688             :                         break;
    1689             : 
    1690           0 :                 if (in_sack) {
    1691           0 :                         TCP_SKB_CB(skb)->sacked =
    1692           0 :                                 tcp_sacktag_one(sk,
    1693             :                                                 state,
    1694           0 :                                                 TCP_SKB_CB(skb)->sacked,
    1695             :                                                 TCP_SKB_CB(skb)->seq,
    1696             :                                                 TCP_SKB_CB(skb)->end_seq,
    1697             :                                                 dup_sack,
    1698             :                                                 tcp_skb_pcount(skb),
    1699             :                                                 tcp_skb_timestamp_us(skb));
    1700           0 :                         tcp_rate_skb_delivered(sk, skb, state->rate);
    1701           0 :                         if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
    1702           0 :                                 list_del_init(&skb->tcp_tsorted_anchor);
    1703             : 
    1704           0 :                         if (!before(TCP_SKB_CB(skb)->seq,
    1705             :                                     tcp_highest_sack_seq(tp)))
    1706           0 :                                 tcp_advance_highest_sack(sk, skb);
    1707             :                 }
    1708             :         }
    1709           0 :         return skb;
    1710             : }
    1711             : 
    1712           0 : static struct sk_buff *tcp_sacktag_bsearch(struct sock *sk, u32 seq)
    1713             : {
    1714           0 :         struct rb_node *parent, **p = &sk->tcp_rtx_queue.rb_node;
    1715           0 :         struct sk_buff *skb;
    1716             : 
    1717           0 :         while (*p) {
    1718           0 :                 parent = *p;
    1719           0 :                 skb = rb_to_skb(parent);
    1720           0 :                 if (before(seq, TCP_SKB_CB(skb)->seq)) {
    1721           0 :                         p = &parent->rb_left;
    1722           0 :                         continue;
    1723             :                 }
    1724           0 :                 if (!before(seq, TCP_SKB_CB(skb)->end_seq)) {
    1725           0 :                         p = &parent->rb_right;
    1726           0 :                         continue;
    1727             :                 }
    1728             :                 return skb;
    1729             :         }
    1730             :         return NULL;
    1731             : }
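
tcp_sacktag_bsearch() above is a plain ordered search: descend left when the target sequence is before the node's seq, right when it is at or past the node's end_seq, and stop when it falls inside [seq, end_seq). The sketch below performs the equivalent search over a sorted array in user space; the struct is invented for the example and wrap-safe comparisons are simplified to plain ones.

#include <stdint.h>
#include <stdio.h>

struct seg { uint32_t seq, end_seq; };

static const struct seg *find_seg(const struct seg *segs, int n, uint32_t seq)
{
        int lo = 0, hi = n - 1;

        while (lo <= hi) {
                int mid = lo + (hi - lo) / 2;

                if (seq < segs[mid].seq)
                        hi = mid - 1;           /* target starts before this segment */
                else if (seq >= segs[mid].end_seq)
                        lo = mid + 1;           /* target starts after this segment */
                else
                        return &segs[mid];      /* seq falls inside [seq, end_seq) */
        }
        return NULL;
}

int main(void)
{
        static const struct seg q[] = {
                { 1000, 2448 }, { 2448, 3896 }, { 3896, 5344 },
        };
        const struct seg *s = find_seg(q, 3, 3000);

        printf("found [%u, %u)\n", s ? s->seq : 0, s ? s->end_seq : 0);
        return 0;
}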
    1732             : 
    1733           0 : static struct sk_buff *tcp_sacktag_skip(struct sk_buff *skb, struct sock *sk,
    1734             :                                         u32 skip_to_seq)
    1735             : {
    1736           0 :         if (skb && after(TCP_SKB_CB(skb)->seq, skip_to_seq))
    1737             :                 return skb;
    1738             : 
    1739           0 :         return tcp_sacktag_bsearch(sk, skip_to_seq);
    1740             : }
    1741             : 
    1742           0 : static struct sk_buff *tcp_maybe_skipping_dsack(struct sk_buff *skb,
    1743             :                                                 struct sock *sk,
    1744             :                                                 struct tcp_sack_block *next_dup,
    1745             :                                                 struct tcp_sacktag_state *state,
    1746             :                                                 u32 skip_to_seq)
    1747             : {
    1748           0 :         if (!next_dup)
    1749             :                 return skb;
    1750             : 
    1751           0 :         if (before(next_dup->start_seq, skip_to_seq)) {
    1752           0 :                 skb = tcp_sacktag_skip(skb, sk, next_dup->start_seq);
    1753           0 :                 skb = tcp_sacktag_walk(skb, sk, NULL, state,
    1754             :                                        next_dup->start_seq, next_dup->end_seq,
    1755             :                                        1);
    1756             :         }
    1757             : 
    1758             :         return skb;
    1759             : }
    1760             : 
    1761           0 : static int tcp_sack_cache_ok(const struct tcp_sock *tp, const struct tcp_sack_block *cache)
    1762             : {
    1763           0 :         return cache < tp->recv_sack_cache + ARRAY_SIZE(tp->recv_sack_cache);
    1764             : }
    1765             : 
    1766             : static int
    1767           0 : tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
    1768             :                         u32 prior_snd_una, struct tcp_sacktag_state *state)
    1769             : {
    1770           0 :         struct tcp_sock *tp = tcp_sk(sk);
    1771           0 :         const unsigned char *ptr = (skb_transport_header(ack_skb) +
    1772           0 :                                     TCP_SKB_CB(ack_skb)->sacked);
    1773           0 :         struct tcp_sack_block_wire *sp_wire = (struct tcp_sack_block_wire *)(ptr+2);
    1774           0 :         struct tcp_sack_block sp[TCP_NUM_SACKS];
    1775           0 :         struct tcp_sack_block *cache;
    1776           0 :         struct sk_buff *skb;
    1777           0 :         int num_sacks = min(TCP_NUM_SACKS, (ptr[1] - TCPOLEN_SACK_BASE) >> 3);
    1778           0 :         int used_sacks;
    1779           0 :         bool found_dup_sack = false;
    1780           0 :         int i, j;
    1781           0 :         int first_sack_index;
    1782             : 
    1783           0 :         state->flag = 0;
    1784           0 :         state->reord = tp->snd_nxt;
    1785             : 
    1786           0 :         if (!tp->sacked_out)
    1787           0 :                 tcp_highest_sack_reset(sk);
    1788             : 
    1789           0 :         found_dup_sack = tcp_check_dsack(sk, ack_skb, sp_wire,
    1790             :                                          num_sacks, prior_snd_una, state);
    1791             : 
     1792             :         /* Eliminate too-old ACKs, but take into
     1793             :          * account more or less fresh ones; they can
     1794             :          * contain valid SACK info.
    1795             :          */
    1796           0 :         if (before(TCP_SKB_CB(ack_skb)->ack_seq, prior_snd_una - tp->max_window))
    1797             :                 return 0;
    1798             : 
    1799           0 :         if (!tp->packets_out)
    1800           0 :                 goto out;
    1801             : 
    1802             :         used_sacks = 0;
    1803             :         first_sack_index = 0;
    1804           0 :         for (i = 0; i < num_sacks; i++) {
    1805           0 :                 bool dup_sack = !i && found_dup_sack;
    1806             : 
    1807           0 :                 sp[used_sacks].start_seq = get_unaligned_be32(&sp_wire[i].start_seq);
    1808           0 :                 sp[used_sacks].end_seq = get_unaligned_be32(&sp_wire[i].end_seq);
    1809             : 
    1810           0 :                 if (!tcp_is_sackblock_valid(tp, dup_sack,
    1811             :                                             sp[used_sacks].start_seq,
    1812             :                                             sp[used_sacks].end_seq)) {
    1813           0 :                         int mib_idx;
    1814             : 
    1815           0 :                         if (dup_sack) {
    1816           0 :                                 if (!tp->undo_marker)
    1817             :                                         mib_idx = LINUX_MIB_TCPDSACKIGNOREDNOUNDO;
    1818             :                                 else
    1819           0 :                                         mib_idx = LINUX_MIB_TCPDSACKIGNOREDOLD;
    1820             :                         } else {
    1821             :                                 /* Don't count olds caused by ACK reordering */
    1822           0 :                                 if ((TCP_SKB_CB(ack_skb)->ack_seq != tp->snd_una) &&
    1823           0 :                                     !after(sp[used_sacks].end_seq, tp->snd_una))
    1824           0 :                                         continue;
    1825             :                                 mib_idx = LINUX_MIB_TCPSACKDISCARD;
    1826             :                         }
    1827             : 
    1828           0 :                         NET_INC_STATS(sock_net(sk), mib_idx);
    1829           0 :                         if (i == 0)
    1830           0 :                                 first_sack_index = -1;
    1831           0 :                         continue;
    1832             :                 }
    1833             : 
    1834             :                 /* Ignore very old stuff early */
    1835           0 :                 if (!after(sp[used_sacks].end_seq, prior_snd_una)) {
    1836           0 :                         if (i == 0)
    1837           0 :                                 first_sack_index = -1;
    1838           0 :                         continue;
    1839             :                 }
    1840             : 
    1841           0 :                 used_sacks++;
    1842             :         }
    1843             : 
    1844             :         /* order SACK blocks to allow in order walk of the retrans queue */
    1845           0 :         for (i = used_sacks - 1; i > 0; i--) {
    1846           0 :                 for (j = 0; j < i; j++) {
    1847           0 :                         if (after(sp[j].start_seq, sp[j + 1].start_seq)) {
    1848           0 :                                 swap(sp[j], sp[j + 1]);
    1849             : 
    1850             :                                 /* Track where the first SACK block goes to */
    1851           0 :                                 if (j == first_sack_index)
    1852           0 :                                         first_sack_index = j + 1;
    1853             :                         }
    1854             :                 }
    1855             :         }
    1856             : 
    1857           0 :         state->mss_now = tcp_current_mss(sk);
    1858           0 :         skb = NULL;
    1859           0 :         i = 0;
    1860             : 
    1861           0 :         if (!tp->sacked_out) {
    1862             :                 /* It's already past, so skip checking against it */
    1863           0 :                 cache = tp->recv_sack_cache + ARRAY_SIZE(tp->recv_sack_cache);
    1864             :         } else {
    1865           0 :                 cache = tp->recv_sack_cache;
    1866             :                 /* Skip empty blocks at the head of the cache */
    1867           0 :                 while (tcp_sack_cache_ok(tp, cache) && !cache->start_seq &&
    1868           0 :                        !cache->end_seq)
    1869           0 :                         cache++;
    1870             :         }
    1871             : 
    1872           0 :         while (i < used_sacks) {
    1873           0 :                 u32 start_seq = sp[i].start_seq;
    1874           0 :                 u32 end_seq = sp[i].end_seq;
    1875           0 :                 bool dup_sack = (found_dup_sack && (i == first_sack_index));
    1876           0 :                 struct tcp_sack_block *next_dup = NULL;
    1877             : 
    1878           0 :                 if (found_dup_sack && ((i + 1) == first_sack_index))
    1879           0 :                         next_dup = &sp[i + 1];
    1880             : 
    1881             :                 /* Skip too early cached blocks */
    1882           0 :                 while (tcp_sack_cache_ok(tp, cache) &&
    1883           0 :                        !before(start_seq, cache->end_seq))
    1884           0 :                         cache++;
    1885             : 
    1886             :                 /* Can skip some work by looking at recv_sack_cache? */
    1887           0 :                 if (tcp_sack_cache_ok(tp, cache) && !dup_sack &&
    1888           0 :                     after(end_seq, cache->start_seq)) {
    1889             : 
    1890             :                         /* Head todo? */
    1891           0 :                         if (before(start_seq, cache->start_seq)) {
    1892           0 :                                 skb = tcp_sacktag_skip(skb, sk, start_seq);
    1893           0 :                                 skb = tcp_sacktag_walk(skb, sk, next_dup,
    1894             :                                                        state,
    1895             :                                                        start_seq,
    1896             :                                                        cache->start_seq,
    1897             :                                                        dup_sack);
    1898             :                         }
    1899             : 
    1900             :                         /* Rest of the block already fully processed? */
    1901           0 :                         if (!after(end_seq, cache->end_seq))
    1902           0 :                                 goto advance_sp;
    1903             : 
    1904           0 :                         skb = tcp_maybe_skipping_dsack(skb, sk, next_dup,
    1905             :                                                        state,
    1906             :                                                        cache->end_seq);
    1907             : 
    1908             :                         /* ...tail remains todo... */
    1909           0 :                         if (tcp_highest_sack_seq(tp) == cache->end_seq) {
    1910             :                                 /* ...but better entrypoint exists! */
    1911           0 :                                 skb = tcp_highest_sack(sk);
    1912           0 :                                 if (!skb)
    1913             :                                         break;
    1914           0 :                                 cache++;
    1915           0 :                                 goto walk;
    1916             :                         }
    1917             : 
    1918           0 :                         skb = tcp_sacktag_skip(skb, sk, cache->end_seq);
    1919             :                         /* Check overlap against next cached too (past this one already) */
    1920           0 :                         cache++;
    1921           0 :                         continue;
    1922             :                 }
    1923             : 
    1924           0 :                 if (!before(start_seq, tcp_highest_sack_seq(tp))) {
    1925           0 :                         skb = tcp_highest_sack(sk);
    1926           0 :                         if (!skb)
    1927             :                                 break;
    1928             :                 }
    1929           0 :                 skb = tcp_sacktag_skip(skb, sk, start_seq);
    1930             : 
    1931           0 : walk:
    1932           0 :                 skb = tcp_sacktag_walk(skb, sk, next_dup, state,
    1933             :                                        start_seq, end_seq, dup_sack);
    1934             : 
    1935           0 : advance_sp:
    1936           0 :                 i++;
    1937             :         }
    1938             : 
    1939             :         /* Clear the head of the cache sack blocks so we can skip it next time */
    1940           0 :         for (i = 0; i < ARRAY_SIZE(tp->recv_sack_cache) - used_sacks; i++) {
    1941           0 :                 tp->recv_sack_cache[i].start_seq = 0;
    1942           0 :                 tp->recv_sack_cache[i].end_seq = 0;
    1943             :         }
    1944           0 :         for (j = 0; j < used_sacks; j++)
    1945           0 :                 tp->recv_sack_cache[i++] = sp[j];
    1946             : 
    1947           0 :         if (inet_csk(sk)->icsk_ca_state != TCP_CA_Loss || tp->undo_marker)
    1948           0 :                 tcp_check_sack_reordering(sk, state->reord, 0);
    1949             : 
    1950           0 :         tcp_verify_left_out(tp);
    1951           0 : out:
    1952             : 
    1953             : #if FASTRETRANS_DEBUG > 0
    1954           0 :         WARN_ON((int)tp->sacked_out < 0);
    1955           0 :         WARN_ON((int)tp->lost_out < 0);
    1956           0 :         WARN_ON((int)tp->retrans_out < 0);
    1957           0 :         WARN_ON((int)tcp_packets_in_flight(tp) < 0);
    1958             : #endif
    1959           0 :         return state->flag;
    1960             : }
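                     : 
                     : /* Editor's illustration (standalone sketch, not part of tcp_input.c): the
                     :  * nested loop above bubble-sorts the SACK blocks by start_seq so the
                     :  * retransmit queue can be walked once in order, while first_sack_index
                     :  * keeps pointing at the block that carried the (D)SACK. Plain '>' below
                     :  * stands in for the wrap-safe after() used by the kernel.
                     :  */
                     : #include <stdint.h>
                     : 
                     : struct sack_block { uint32_t start_seq, end_seq; };
                     : 
                     : static void sort_sack_blocks(struct sack_block *sp, int used, int *first_idx)
                     : {
                     :         for (int i = used - 1; i > 0; i--) {
                     :                 for (int j = 0; j < i; j++) {
                     :                         if (sp[j].start_seq > sp[j + 1].start_seq) {
                     :                                 struct sack_block tmp = sp[j];
                     : 
                     :                                 sp[j] = sp[j + 1];
                     :                                 sp[j + 1] = tmp;
                     :                                 /* Track where the first block moved to */
                     :                                 if (j == *first_idx)
                     :                                         *first_idx = j + 1;
                     :                         }
                     :                 }
                     :         }
                     : }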
    1961             : 
    1962             : /* Limits sacked_out so that sum with lost_out isn't ever larger than
    1963             :  * packets_out. Returns false if sacked_out adjustment wasn't necessary.
    1964             :  */
    1965         351 : static bool tcp_limit_reno_sacked(struct tcp_sock *tp)
    1966             : {
    1967         351 :         u32 holes;
    1968             : 
    1969         351 :         holes = max(tp->lost_out, 1U);
    1970         351 :         holes = min(holes, tp->packets_out);
    1971             : 
    1972         351 :         if ((tp->sacked_out + holes) > tp->packets_out) {
    1973           0 :                 tp->sacked_out = tp->packets_out - holes;
    1974           0 :                 return true;
    1975             :         }
    1976             :         return false;
    1977             : }
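                     : 
                     : /* Editor's illustration (standalone sketch, not kernel code) of the clamp
                     :  * above: with packets_out = 10, lost_out = 3 and a claimed sacked_out = 9,
                     :  * holes = 3 and sacked_out is trimmed to 10 - 3 = 7, preserving the
                     :  * invariant sacked_out + lost_out <= packets_out.
                     :  */
                     : #include <stdbool.h>
                     : #include <stdint.h>
                     : 
                     : static bool limit_reno_sacked(uint32_t packets_out, uint32_t lost_out,
                     :                               uint32_t *sacked_out)
                     : {
                     :         uint32_t holes = lost_out > 1 ? lost_out : 1;
                     : 
                     :         if (holes > packets_out)
                     :                 holes = packets_out;
                     :         if (*sacked_out + holes > packets_out) {
                     :                 *sacked_out = packets_out - holes;
                     :                 return true;
                     :         }
                     :         return false;
                     : }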
    1978             : 
    1979             : /* If we receive more dupacks than we expected when counting segments
    1980             :  * under the assumption of no reordering, interpret this as reordering.
    1981             :  * The only other reason could be a bug in the receiver's TCP.
    1982             :  */
    1983         351 : static void tcp_check_reno_reordering(struct sock *sk, const int addend)
    1984             : {
    1985         351 :         struct tcp_sock *tp = tcp_sk(sk);
    1986             : 
    1987         351 :         if (!tcp_limit_reno_sacked(tp))
    1988             :                 return;
    1989             : 
    1990           0 :         tp->reordering = min_t(u32, tp->packets_out + addend,
    1991             :                                sock_net(sk)->ipv4.sysctl_tcp_max_reordering);
    1992           0 :         tp->reord_seen++;
    1993           0 :         NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRENOREORDER);
    1994             : }
    1995             : 
    1996             : /* Emulate SACKs for SACKless connection: account for a new dupack. */
    1997             : 
    1998           0 : static void tcp_add_reno_sack(struct sock *sk, int num_dupack, bool ece_ack)
    1999             : {
    2000           0 :         if (num_dupack) {
    2001           0 :                 struct tcp_sock *tp = tcp_sk(sk);
    2002           0 :                 u32 prior_sacked = tp->sacked_out;
    2003           0 :                 s32 delivered;
    2004             : 
    2005           0 :                 tp->sacked_out += num_dupack;
    2006           0 :                 tcp_check_reno_reordering(sk, 0);
    2007           0 :                 delivered = tp->sacked_out - prior_sacked;
    2008           0 :                 if (delivered > 0)
    2009           0 :                         tcp_count_delivered(tp, delivered, ece_ack);
    2010           0 :                 tcp_verify_left_out(tp);
    2011             :         }
    2012           0 : }
    2013             : 
    2014             : /* Account for ACK, ACKing some data in Reno Recovery phase. */
    2015             : 
    2016         351 : static void tcp_remove_reno_sacks(struct sock *sk, int acked, bool ece_ack)
    2017             : {
    2018         351 :         struct tcp_sock *tp = tcp_sk(sk);
    2019             : 
    2020         351 :         if (acked > 0) {
    2021             :                 /* One ACK acked hole. The rest eat duplicate ACKs. */
    2022         351 :                 tcp_count_delivered(tp, max_t(int, acked - tp->sacked_out, 1),
    2023             :                                     ece_ack);
    2024         351 :                 if (acked - 1 >= tp->sacked_out)
    2025         351 :                         tp->sacked_out = 0;
    2026             :                 else
    2027           0 :                         tp->sacked_out -= acked - 1;
    2028             :         }
    2029         351 :         tcp_check_reno_reordering(sk, acked);
    2030         351 :         tcp_verify_left_out(tp);
    2031         351 : }
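                     : 
                     : /* Editor's sketch (standalone, not part of tcp_input.c) of the Reno
                     :  * accounting above: with acked = 4 and sacked_out = 2, one ACK fills the
                     :  * hole, delivered = max(4 - 2, 1) = 2, and sacked_out drops by
                     :  * acked - 1 = 3, i.e. to zero here.
                     :  */
                     : #include <stdint.h>
                     : 
                     : static uint32_t reno_remove_sacks(uint32_t acked, uint32_t *sacked_out)
                     : {
                     :         uint32_t delivered = 0;
                     : 
                     :         if (acked > 0) {
                     :                 delivered = acked > *sacked_out ? acked - *sacked_out : 1;
                     :                 if (acked - 1 >= *sacked_out)
                     :                         *sacked_out = 0;
                     :                 else
                     :                         *sacked_out -= acked - 1;
                     :         }
                     :         return delivered;
                     : }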
    2032             : 
    2033           0 : static inline void tcp_reset_reno_sack(struct tcp_sock *tp)
    2034             : {
    2035           0 :         tp->sacked_out = 0;
    2036           0 : }
    2037             : 
    2038           0 : void tcp_clear_retrans(struct tcp_sock *tp)
    2039             : {
    2040           0 :         tp->retrans_out = 0;
    2041           0 :         tp->lost_out = 0;
    2042           0 :         tp->undo_marker = 0;
    2043           0 :         tp->undo_retrans = -1;
    2044           0 :         tp->sacked_out = 0;
    2045           0 : }
    2046             : 
    2047           0 : static inline void tcp_init_undo(struct tcp_sock *tp)
    2048             : {
    2049           0 :         tp->undo_marker = tp->snd_una;
    2050             :         /* Retransmission still in flight may cause DSACKs later. */
    2051           0 :         tp->undo_retrans = tp->retrans_out ? : -1;
    2052           0 : }
    2053             : 
    2054           0 : static bool tcp_is_rack(const struct sock *sk)
    2055             : {
    2056           0 :         return sock_net(sk)->ipv4.sysctl_tcp_recovery & TCP_RACK_LOSS_DETECTION;
    2057             : }
    2058             : 
    2059             : /* If we detect SACK reneging, forget all SACK information
    2060             :  * and reset tags completely; otherwise preserve SACKs. If the receiver
    2061             :  * dropped its ofo queue, we will know this due to reneging detection.
    2062             :  */
    2063           0 : static void tcp_timeout_mark_lost(struct sock *sk)
    2064             : {
    2065           0 :         struct tcp_sock *tp = tcp_sk(sk);
    2066           0 :         struct sk_buff *skb, *head;
    2067           0 :         bool is_reneg;                  /* is receiver reneging on SACKs? */
    2068             : 
    2069           0 :         head = tcp_rtx_queue_head(sk);
    2070           0 :         is_reneg = head && (TCP_SKB_CB(head)->sacked & TCPCB_SACKED_ACKED);
    2071           0 :         if (is_reneg) {
    2072           0 :                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSACKRENEGING);
    2073           0 :                 tp->sacked_out = 0;
    2074             :                 /* Mark SACK reneging until we recover from this loss event. */
    2075           0 :                 tp->is_sack_reneg = 1;
    2076           0 :         } else if (tcp_is_reno(tp)) {
    2077           0 :                 tcp_reset_reno_sack(tp);
    2078             :         }
    2079             : 
    2080             :         skb = head;
    2081           0 :         skb_rbtree_walk_from(skb) {
    2082           0 :                 if (is_reneg)
    2083           0 :                         TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED;
    2084           0 :                 else if (tcp_is_rack(sk) && skb != head &&
    2085           0 :                          tcp_rack_skb_timeout(tp, skb, 0) > 0)
    2086           0 :                         continue; /* Don't mark recently sent ones lost yet */
    2087           0 :                 tcp_mark_skb_lost(sk, skb);
    2088             :         }
    2089           0 :         tcp_verify_left_out(tp);
    2090           0 :         tcp_clear_all_retrans_hints(tp);
    2091           0 : }
    2092             : 
    2093             : /* Enter Loss state. */
    2094           0 : void tcp_enter_loss(struct sock *sk)
    2095             : {
    2096           0 :         const struct inet_connection_sock *icsk = inet_csk(sk);
    2097           0 :         struct tcp_sock *tp = tcp_sk(sk);
    2098           0 :         struct net *net = sock_net(sk);
    2099           0 :         bool new_recovery = icsk->icsk_ca_state < TCP_CA_Recovery;
    2100             : 
    2101           0 :         tcp_timeout_mark_lost(sk);
    2102             : 
    2103             :         /* Reduce ssthresh if it has not yet been made inside this window. */
    2104           0 :         if (icsk->icsk_ca_state <= TCP_CA_Disorder ||
    2105           0 :             !after(tp->high_seq, tp->snd_una) ||
    2106           0 :             (icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) {
    2107           0 :                 tp->prior_ssthresh = tcp_current_ssthresh(sk);
    2108           0 :                 tp->prior_cwnd = tp->snd_cwnd;
    2109           0 :                 tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
    2110           0 :                 tcp_ca_event(sk, CA_EVENT_LOSS);
    2111           0 :                 tcp_init_undo(tp);
    2112             :         }
    2113           0 :         tp->snd_cwnd    = tcp_packets_in_flight(tp) + 1;
    2114           0 :         tp->snd_cwnd_cnt   = 0;
    2115           0 :         tp->snd_cwnd_stamp = tcp_jiffies32;
    2116             : 
    2117             :         /* Timeout in disordered state after receiving substantial DUPACKs
    2118             :          * suggests that the degree of reordering is over-estimated.
    2119             :          */
    2120           0 :         if (icsk->icsk_ca_state <= TCP_CA_Disorder &&
    2121           0 :             tp->sacked_out >= net->ipv4.sysctl_tcp_reordering)
    2122           0 :                 tp->reordering = min_t(unsigned int, tp->reordering,
    2123             :                                        net->ipv4.sysctl_tcp_reordering);
    2124           0 :         tcp_set_ca_state(sk, TCP_CA_Loss);
    2125           0 :         tp->high_seq = tp->snd_nxt;
    2126           0 :         tcp_ecn_queue_cwr(tp);
    2127             : 
    2128             :         /* F-RTO RFC5682 sec 3.1 step 1: retransmit SND.UNA if no previous
    2129             :          * loss recovery is underway except recurring timeout(s) on
    2130             :          * the same SND.UNA (sec 3.2). Disable F-RTO on path MTU probing
    2131             :          */
    2132           0 :         tp->frto = net->ipv4.sysctl_tcp_frto &&
    2133           0 :                    (new_recovery || icsk->icsk_retransmits) &&
    2134           0 :                    !inet_csk(sk)->icsk_mtup.probe_size;
    2135           0 : }
    2136             : 
    2137             : /* If an ACK arrives pointing to a remembered SACK, it means that our
    2138             :  * remembered SACKs do not reflect the real state of the receiver, i.e.
    2139             :  * receiver _host_ is heavily congested (or buggy).
    2140             :  *
    2141             :  * To avoid big spurious retransmission bursts due to transient SACK
    2142             :  * scoreboard oddities that look like reneging, we give the receiver a
    2143             :  * little time (max(RTT/2, 10ms)) to send us some more ACKs that will
    2144             :  * restore sanity to the SACK scoreboard. If the apparent reneging
    2145             :  * persists until this RTO then we'll clear the SACK scoreboard.
    2146             :  */
    2147           0 : static bool tcp_check_sack_reneging(struct sock *sk, int flag)
    2148             : {
    2149           0 :         if (flag & FLAG_SACK_RENEGING) {
    2150           0 :                 struct tcp_sock *tp = tcp_sk(sk);
    2151           0 :                 unsigned long delay = max(usecs_to_jiffies(tp->srtt_us >> 4),
    2152             :                                           msecs_to_jiffies(10));
    2153             : 
    2154           0 :                 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
    2155             :                                           delay, TCP_RTO_MAX);
    2156           0 :                 return true;
    2157             :         }
    2158             :         return false;
    2159             : }
    2160             : 
    2161             : /* Heuristics to calculate the number of duplicate ACKs. There's no dupACK
    2162             :  * counter when SACK is enabled (without SACK, sacked_out is used for
    2163             :  * that purpose).
    2164             :  *
    2165             :  * With reordering, holes may still be in flight, so RFC3517 recovery
    2166             :  * uses pure sacked_out (total number of SACKed segments) even though
    2167             :  * it violates the RFC, which is defined in terms of duplicate ACKs. The two
    2168             :  * are often equal, but they differ when e.g. out-of-window ACKs or packet
    2169             :  * duplication occurs. Since neither occurs due to loss, TCP should really
    2170             :  * ignore them.
    2171             :  */
    2172           0 : static inline int tcp_dupack_heuristics(const struct tcp_sock *tp)
    2173             : {
    2174           0 :         return tp->sacked_out + 1;
    2175             : }
    2176             : 
    2177             : /* Linux NewReno/SACK/ECN state machine.
    2178             :  * --------------------------------------
    2179             :  *
    2180             :  * "Open"     Normal state, no dubious events, fast path.
    2181             :  * "Disorder"   In all the respects it is "Open",
    2182             :  *              but requires a bit more attention. It is entered when
    2183             :  *              we see some SACKs or dupacks. It is split off from "Open"
    2184             :  *              mainly to move some processing from fast path to slow one.
    2185             :  * "CWR"      CWND was reduced due to some Congestion Notification event.
    2186             :  *              It can be ECN, ICMP source quench, local device congestion.
    2187             :  * "Recovery" CWND was reduced, we are fast-retransmitting.
    2188             :  * "Loss"     CWND was reduced due to RTO timeout or SACK reneging.
    2189             :  *
    2190             :  * tcp_fastretrans_alert() is entered:
    2191             :  * - each incoming ACK, if state is not "Open"
    2192             :  * - when arrived ACK is unusual, namely:
    2193             :  *      * SACK
    2194             :  *      * Duplicate ACK.
    2195             :  *      * ECN ECE.
    2196             :  *
    2197             :  * Counting packets in flight is pretty simple.
    2198             :  *
    2199             :  *      in_flight = packets_out - left_out + retrans_out
    2200             :  *
    2201             :  *      packets_out is SND.NXT-SND.UNA counted in packets.
    2202             :  *
    2203             :  *      retrans_out is number of retransmitted segments.
    2204             :  *
    2205             :  *      left_out is the number of segments that left the network but are not yet ACKed.
    2206             :  *
    2207             :  *              left_out = sacked_out + lost_out
    2208             :  *
    2209             :  *     sacked_out: Packets which arrived at the receiver out of order
    2210             :  *                 and hence are not ACKed. With SACKs this number is simply
    2211             :  *                 the amount of SACKed data. Even without SACKs
    2212             :  *                 it is easy to give a pretty reliable estimate of this number by
    2213             :  *                 counting duplicate ACKs.
    2214             :  *
    2215             :  *       lost_out: Packets lost by network. TCP has no explicit
    2216             :  *                 "loss notification" feedback from network (for now).
    2217             :  *                 It means that this number can be only _guessed_.
    2218             :  *                 Actually, it is the heuristic used to predict lossage that
    2219             :  *                 distinguishes different algorithms.
    2220             :  *
    2221             :  *      F.e. after RTO, when all the queue is considered as lost,
    2222             :  *      lost_out = packets_out and in_flight = retrans_out.
    2223             :  *
    2224             :  *              Essentially, we have now a few algorithms detecting
    2225             :  *              lost packets.
    2226             :  *
    2227             :  *              If the receiver supports SACK:
    2228             :  *
    2229             :  *              RFC6675/3517: It is the conventional algorithm. A packet is
    2230             :  *              considered lost if the number of higher sequence packets
    2231             :  *              SACKed is greater than or equal to the DUPACK threshold
    2232             :  *              (reordering). This is implemented in tcp_mark_head_lost and
    2233             :  *              tcp_update_scoreboard.
    2234             :  *
    2235             :  *              RACK (draft-ietf-tcpm-rack-01): it is a newer algorithm
    2236             :  *              (2017-) that checks timing instead of counting DUPACKs.
    2237             :  *              Essentially a packet is considered lost if it's not S/ACKed
    2238             :  *              after RTT + reordering_window, where both metrics are
    2239             :  *              dynamically measured and adjusted. This is implemented in
    2240             :  *              tcp_rack_mark_lost.
    2241             :  *
    2242             :  *              If the receiver does not support SACK:
    2243             :  *
    2244             :  *              NewReno (RFC6582): in Recovery we assume that one segment
    2245             :  *              is lost (classic Reno). While we are in Recovery and
    2246             :  *              a partial ACK arrives, we assume that one more packet
    2247             :  *              is lost (NewReno). These heuristics are the same in NewReno
    2248             :  *              and SACK.
    2249             :  *
    2250             :  * Really tricky (and requiring careful tuning) part of algorithm
    2251             :  * is hidden in functions tcp_time_to_recover() and tcp_xmit_retransmit_queue().
    2252             :  * The first determines the moment _when_ we should reduce CWND and,
    2253             :  * hence, slow down forward transmission. In fact, it determines the moment
    2254             :  * when we decide that hole is caused by loss, rather than by a reorder.
    2255             :  *
    2256             :  * tcp_xmit_retransmit_queue() decides, _what_ we should retransmit to fill
    2257             :  * holes, caused by lost packets.
    2258             :  *
    2259             :  * And the most logically complicated part of algorithm is undo
    2260             :  * heuristics. We detect false retransmits due to both too early
    2261             :  * fast retransmit (reordering) and underestimated RTO, analyzing
    2262             :  * timestamps and D-SACKs. When we detect that some segments were
    2263             :  * retransmitted by mistake and CWND reduction was wrong, we undo
    2264             :  * window reduction and abort recovery phase. This logic is hidden
    2265             :  * inside several functions named tcp_try_undo_<something>.
    2266             :  */
    2267             : 
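                     : /* Editor's illustration of the bookkeeping described above (standalone
                     :  * sketch, not part of tcp_input.c): with packets_out = 20, sacked_out = 4,
                     :  * lost_out = 2 and retrans_out = 1, left_out = 6 and
                     :  * in_flight = 20 - 6 + 1 = 15.
                     :  */
                     : #include <stdint.h>
                     : 
                     : static uint32_t example_packets_in_flight(uint32_t packets_out,
                     :                                           uint32_t sacked_out,
                     :                                           uint32_t lost_out,
                     :                                           uint32_t retrans_out)
                     : {
                     :         uint32_t left_out = sacked_out + lost_out;
                     : 
                     :         return packets_out - left_out + retrans_out;
                     : }
                     : 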
    2268             : /* This function decides, when we should leave Disordered state
    2269             :  * and enter Recovery phase, reducing congestion window.
    2270             :  *
    2271             :  * Main question: may we further continue forward transmission
    2272             :  * with the same cwnd?
    2273             :  */
    2274           0 : static bool tcp_time_to_recover(struct sock *sk, int flag)
    2275             : {
    2276           0 :         struct tcp_sock *tp = tcp_sk(sk);
    2277             : 
    2278             :         /* Trick#1: The loss is proven. */
    2279           0 :         if (tp->lost_out)
    2280             :                 return true;
    2281             : 
    2282             :         /* Not-A-Trick#2 : Classic rule... */
    2283           0 :         if (!tcp_is_rack(sk) && tcp_dupack_heuristics(tp) > tp->reordering)
    2284             :                 return true;
    2285             : 
    2286             :         return false;
    2287             : }
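                     : 
                     : /* Editor's sketch (standalone, not kernel code) of the classic rule above:
                     :  * without RACK, recovery starts once the dupACK estimate (sacked_out + 1)
                     :  * exceeds the reordering degree, e.g. sacked_out = 3 and reordering = 3
                     :  * gives 3 + 1 > 3.
                     :  */
                     : #include <stdbool.h>
                     : #include <stdint.h>
                     : 
                     : static bool classic_time_to_recover(uint32_t lost_out, uint32_t sacked_out,
                     :                                     uint32_t reordering)
                     : {
                     :         if (lost_out)                       /* Trick#1: loss already proven */
                     :                 return true;
                     :         return sacked_out + 1 > reordering; /* dupACK heuristic */
                     : }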
    2288             : 
    2289             : /* Detect loss in event "A" above by marking head of queue up as lost.
    2290             :  * For RFC3517 SACK, a segment is considered lost if it
    2291             :  * has at least tp->reordering SACKed segments above it; "packets" refers to
    2292             :  * the maximum SACKed segments to pass before reaching this limit.
    2293             :  */
    2294           0 : static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
    2295             : {
    2296           0 :         struct tcp_sock *tp = tcp_sk(sk);
    2297           0 :         struct sk_buff *skb;
    2298           0 :         int cnt;
    2299             :         /* Use SACK to deduce losses of new sequences sent during recovery */
    2300           0 :         const u32 loss_high = tp->snd_nxt;
    2301             : 
    2302           0 :         WARN_ON(packets > tp->packets_out);
    2303           0 :         skb = tp->lost_skb_hint;
    2304           0 :         if (skb) {
    2305             :                 /* Head already handled? */
    2306           0 :                 if (mark_head && after(TCP_SKB_CB(skb)->seq, tp->snd_una))
    2307             :                         return;
    2308           0 :                 cnt = tp->lost_cnt_hint;
    2309             :         } else {
    2310           0 :                 skb = tcp_rtx_queue_head(sk);
    2311             :                 cnt = 0;
    2312             :         }
    2313             : 
    2314           0 :         skb_rbtree_walk_from(skb) {
    2315             :                 /* TODO: do this better */
    2316             :                 /* this is not the most efficient way to do this... */
    2317           0 :                 tp->lost_skb_hint = skb;
    2318           0 :                 tp->lost_cnt_hint = cnt;
    2319             : 
    2320           0 :                 if (after(TCP_SKB_CB(skb)->end_seq, loss_high))
    2321             :                         break;
    2322             : 
    2323           0 :                 if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
    2324           0 :                         cnt += tcp_skb_pcount(skb);
    2325             : 
    2326           0 :                 if (cnt > packets)
    2327             :                         break;
    2328             : 
    2329           0 :                 if (!(TCP_SKB_CB(skb)->sacked & TCPCB_LOST))
    2330           0 :                         tcp_mark_skb_lost(sk, skb);
    2331             : 
    2332           0 :                 if (mark_head)
    2333             :                         break;
    2334             :         }
    2335           0 :         tcp_verify_left_out(tp);
    2336             : }
    2337             : 
    2338             : /* Account newly detected lost packet(s) */
    2339             : 
    2340           0 : static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit)
    2341             : {
    2342           0 :         struct tcp_sock *tp = tcp_sk(sk);
    2343             : 
    2344           0 :         if (tcp_is_sack(tp)) {
    2345           0 :                 int sacked_upto = tp->sacked_out - tp->reordering;
    2346           0 :                 if (sacked_upto >= 0)
    2347           0 :                         tcp_mark_head_lost(sk, sacked_upto, 0);
    2348           0 :                 else if (fast_rexmit)
    2349           0 :                         tcp_mark_head_lost(sk, 1, 1);
    2350             :         }
    2351           0 : }
    2352             : 
    2353           0 : static bool tcp_tsopt_ecr_before(const struct tcp_sock *tp, u32 when)
    2354             : {
    2355           0 :         return tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
    2356           0 :                before(tp->rx_opt.rcv_tsecr, when);
    2357             : }
    2358             : 
    2359             : /* skb is spurious retransmitted if the returned timestamp echo
    2360             :  * reply is prior to the skb transmission time
    2361             :  */
    2362           0 : static bool tcp_skb_spurious_retrans(const struct tcp_sock *tp,
    2363             :                                      const struct sk_buff *skb)
    2364             : {
    2365           0 :         return (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS) &&
    2366           0 :                tcp_tsopt_ecr_before(tp, tcp_skb_timestamp(skb));
    2367             : }
    2368             : 
    2369             : /* Nothing was retransmitted or returned timestamp is less
    2370             :  * than timestamp of the first retransmission.
    2371             :  */
    2372           0 : static inline bool tcp_packet_delayed(const struct tcp_sock *tp)
    2373             : {
    2374           0 :         return tp->retrans_stamp &&
    2375           0 :                tcp_tsopt_ecr_before(tp, tp->retrans_stamp);
    2376             : }
    2377             : 
    2378             : /* Undo procedures. */
    2379             : 
    2380             : /* We can clear retrans_stamp when there are no retransmissions in the
    2381             :  * window. It would seem that it is trivially available for us in
    2382             :  * tp->retrans_out; however, that kind of assumption doesn't consider
    2383             :  * what will happen if errors occur when sending retransmission for the
    2384             :  * second time. ...It could be that such a segment has only
    2385             :  * TCPCB_EVER_RETRANS set at the present time. It seems that checking
    2386             :  * the head skb is enough except for some reneging corner cases that
    2387             :  * are not worth the effort.
    2388             :  *
    2389             :  * Main reason for all this complexity is the fact that connection dying
    2390             :  * time now depends on the validity of the retrans_stamp, in particular,
    2391             :  * that successive retransmissions of a segment must not advance
    2392             :  * retrans_stamp under any conditions.
    2393             :  */
    2394           0 : static bool tcp_any_retrans_done(const struct sock *sk)
    2395             : {
    2396           0 :         const struct tcp_sock *tp = tcp_sk(sk);
    2397           0 :         struct sk_buff *skb;
    2398             : 
    2399           0 :         if (tp->retrans_out)
    2400             :                 return true;
    2401             : 
    2402           0 :         skb = tcp_rtx_queue_head(sk);
    2403           0 :         if (unlikely(skb && TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS))
    2404           0 :                 return true;
    2405             : 
    2406             :         return false;
    2407             : }
    2408             : 
    2409           0 : static void DBGUNDO(struct sock *sk, const char *msg)
    2410             : {
    2411             : #if FASTRETRANS_DEBUG > 1
    2412             :         struct tcp_sock *tp = tcp_sk(sk);
    2413             :         struct inet_sock *inet = inet_sk(sk);
    2414             : 
    2415             :         if (sk->sk_family == AF_INET) {
    2416             :                 pr_debug("Undo %s %pI4/%u c%u l%u ss%u/%u p%u\n",
    2417             :                          msg,
    2418             :                          &inet->inet_daddr, ntohs(inet->inet_dport),
    2419             :                          tp->snd_cwnd, tcp_left_out(tp),
    2420             :                          tp->snd_ssthresh, tp->prior_ssthresh,
    2421             :                          tp->packets_out);
    2422             :         }
    2423             : #if IS_ENABLED(CONFIG_IPV6)
    2424             :         else if (sk->sk_family == AF_INET6) {
    2425             :                 pr_debug("Undo %s %pI6/%u c%u l%u ss%u/%u p%u\n",
    2426             :                          msg,
    2427             :                          &sk->sk_v6_daddr, ntohs(inet->inet_dport),
    2428             :                          tp->snd_cwnd, tcp_left_out(tp),
    2429             :                          tp->snd_ssthresh, tp->prior_ssthresh,
    2430             :                          tp->packets_out);
    2431             :         }
    2432             : #endif
    2433             : #endif
    2434           0 : }
    2435             : 
    2436           0 : static void tcp_undo_cwnd_reduction(struct sock *sk, bool unmark_loss)
    2437             : {
    2438           0 :         struct tcp_sock *tp = tcp_sk(sk);
    2439             : 
    2440           0 :         if (unmark_loss) {
    2441           0 :                 struct sk_buff *skb;
    2442             : 
    2443           0 :                 skb_rbtree_walk(skb, &sk->tcp_rtx_queue) {
    2444           0 :                         TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;
    2445             :                 }
    2446           0 :                 tp->lost_out = 0;
    2447           0 :                 tcp_clear_all_retrans_hints(tp);
    2448             :         }
    2449             : 
    2450           0 :         if (tp->prior_ssthresh) {
    2451           0 :                 const struct inet_connection_sock *icsk = inet_csk(sk);
    2452             : 
    2453           0 :                 tp->snd_cwnd = icsk->icsk_ca_ops->undo_cwnd(sk);
    2454             : 
    2455           0 :                 if (tp->prior_ssthresh > tp->snd_ssthresh) {
    2456           0 :                         tp->snd_ssthresh = tp->prior_ssthresh;
    2457           0 :                         tcp_ecn_withdraw_cwr(tp);
    2458             :                 }
    2459             :         }
    2460           0 :         tp->snd_cwnd_stamp = tcp_jiffies32;
    2461           0 :         tp->undo_marker = 0;
    2462           0 :         tp->rack.advanced = 1; /* Force RACK to re-exam losses */
    2463           0 : }
    2464             : 
    2465           0 : static inline bool tcp_may_undo(const struct tcp_sock *tp)
    2466             : {
    2467           0 :         return tp->undo_marker && (!tp->undo_retrans || tcp_packet_delayed(tp));
    2468             : }
    2469             : 
    2470             : /* People celebrate: "We love our President!" */
    2471           0 : static bool tcp_try_undo_recovery(struct sock *sk)
    2472             : {
    2473           0 :         struct tcp_sock *tp = tcp_sk(sk);
    2474             : 
    2475           0 :         if (tcp_may_undo(tp)) {
    2476           0 :                 int mib_idx;
    2477             : 
    2478             :                 /* Happy end! We did not retransmit anything
    2479             :                  * or our original transmission succeeded.
    2480             :                  */
    2481           0 :                 DBGUNDO(sk, inet_csk(sk)->icsk_ca_state == TCP_CA_Loss ? "loss" : "retrans");
    2482           0 :                 tcp_undo_cwnd_reduction(sk, false);
    2483           0 :                 if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss)
    2484             :                         mib_idx = LINUX_MIB_TCPLOSSUNDO;
    2485             :                 else
    2486           0 :                         mib_idx = LINUX_MIB_TCPFULLUNDO;
    2487             : 
    2488           0 :                 NET_INC_STATS(sock_net(sk), mib_idx);
    2489           0 :         } else if (tp->rack.reo_wnd_persist) {
    2490           0 :                 tp->rack.reo_wnd_persist--;
    2491             :         }
    2492           0 :         if (tp->snd_una == tp->high_seq && tcp_is_reno(tp)) {
    2493             :                 /* Hold old state until something *above* high_seq
    2494             :                  * is ACKed. For Reno it is MUST to prevent false
    2495             :                  * fast retransmits (RFC2582). SACK TCP is safe. */
    2496           0 :                 if (!tcp_any_retrans_done(sk))
    2497           0 :                         tp->retrans_stamp = 0;
    2498           0 :                 return true;
    2499             :         }
    2500           0 :         tcp_set_ca_state(sk, TCP_CA_Open);
    2501           0 :         tp->is_sack_reneg = 0;
    2502           0 :         return false;
    2503             : }
    2504             : 
    2505             : /* Try to undo cwnd reduction, because D-SACKs acked all retransmitted data */
    2506           0 : static bool tcp_try_undo_dsack(struct sock *sk)
    2507             : {
    2508           0 :         struct tcp_sock *tp = tcp_sk(sk);
    2509             : 
    2510           0 :         if (tp->undo_marker && !tp->undo_retrans) {
    2511           0 :                 tp->rack.reo_wnd_persist = min(TCP_RACK_RECOVERY_THRESH,
    2512             :                                                tp->rack.reo_wnd_persist + 1);
    2513           0 :                 DBGUNDO(sk, "D-SACK");
    2514           0 :                 tcp_undo_cwnd_reduction(sk, false);
    2515           0 :                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKUNDO);
    2516           0 :                 return true;
    2517             :         }
    2518             :         return false;
    2519             : }
    2520             : 
    2521             : /* Undo during loss recovery after partial ACK or using F-RTO. */
    2522           0 : static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo)
    2523             : {
    2524           0 :         struct tcp_sock *tp = tcp_sk(sk);
    2525             : 
    2526           0 :         if (frto_undo || tcp_may_undo(tp)) {
    2527           0 :                 tcp_undo_cwnd_reduction(sk, true);
    2528             : 
    2529           0 :                 DBGUNDO(sk, "partial loss");
    2530           0 :                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPLOSSUNDO);
    2531           0 :                 if (frto_undo)
    2532           0 :                         NET_INC_STATS(sock_net(sk),
    2533             :                                         LINUX_MIB_TCPSPURIOUSRTOS);
    2534           0 :                 inet_csk(sk)->icsk_retransmits = 0;
    2535           0 :                 if (frto_undo || tcp_is_sack(tp)) {
    2536           0 :                         tcp_set_ca_state(sk, TCP_CA_Open);
    2537           0 :                         tp->is_sack_reneg = 0;
    2538             :                 }
    2539           0 :                 return true;
    2540             :         }
    2541             :         return false;
    2542             : }
    2543             : 
    2544             : /* The cwnd reduction in CWR and Recovery uses the PRR algorithm in RFC 6937.
    2545             :  * It computes the number of packets to send (sndcnt) based on packets newly
    2546             :  * delivered:
    2547             :  *   1) If the number of packets in flight is larger than ssthresh, PRR spreads the
    2548             :  *      cwnd reductions across a full RTT.
    2549             :  *   2) Otherwise PRR uses packet conservation to send as much as delivered.
    2550             :  *      But when SND_UNA is acked without further losses,
    2551             :  *      slow starts cwnd up to ssthresh to speed up the recovery.
    2552             :  */
    2553           0 : static void tcp_init_cwnd_reduction(struct sock *sk)
    2554             : {
    2555           0 :         struct tcp_sock *tp = tcp_sk(sk);
    2556             : 
    2557           0 :         tp->high_seq = tp->snd_nxt;
    2558           0 :         tp->tlp_high_seq = 0;
    2559           0 :         tp->snd_cwnd_cnt = 0;
    2560           0 :         tp->prior_cwnd = tp->snd_cwnd;
    2561           0 :         tp->prr_delivered = 0;
    2562           0 :         tp->prr_out = 0;
    2563           0 :         tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk);
    2564           0 :         tcp_ecn_queue_cwr(tp);
    2565           0 : }
    2566             : 
    2567           0 : void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int newly_lost, int flag)
    2568             : {
    2569           0 :         struct tcp_sock *tp = tcp_sk(sk);
    2570           0 :         int sndcnt = 0;
    2571           0 :         int delta = tp->snd_ssthresh - tcp_packets_in_flight(tp);
    2572             : 
    2573           0 :         if (newly_acked_sacked <= 0 || WARN_ON_ONCE(!tp->prior_cwnd))
    2574             :                 return;
    2575             : 
    2576           0 :         tp->prr_delivered += newly_acked_sacked;
    2577           0 :         if (delta < 0) {
    2578           0 :                 u64 dividend = (u64)tp->snd_ssthresh * tp->prr_delivered +
    2579           0 :                                tp->prior_cwnd - 1;
    2580           0 :                 sndcnt = div_u64(dividend, tp->prior_cwnd) - tp->prr_out;
    2581           0 :         } else if (flag & FLAG_SND_UNA_ADVANCED && !newly_lost) {
    2582           0 :                 sndcnt = min_t(int, delta,
    2583             :                                max_t(int, tp->prr_delivered - tp->prr_out,
    2584             :                                      newly_acked_sacked) + 1);
    2585             :         } else {
    2586           0 :                 sndcnt = min(delta, newly_acked_sacked);
    2587             :         }
    2588             :         /* Force a fast retransmit upon entering fast recovery */
    2589           0 :         sndcnt = max(sndcnt, (tp->prr_out ? 0 : 1));
    2590           0 :         tp->snd_cwnd = tcp_packets_in_flight(tp) + sndcnt;
    2591             : }
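                     : 
                     : /* Editor's illustration of the proportional branch above (standalone
                     :  * sketch; prr_sndcnt() is a hypothetical name, not a kernel symbol): with
                     :  * prior_cwnd = 10, ssthresh = 5, prr_delivered = 2 and prr_out = 0, the
                     :  * rounded-up division gives sndcnt = (5 * 2 + 9) / 10 - 0 = 1, i.e. one
                     :  * packet may be sent for two newly delivered ones.
                     :  */
                     : #include <stdint.h>
                     : 
                     : static int64_t prr_sndcnt(uint64_t ssthresh, uint64_t prr_delivered,
                     :                           uint64_t prior_cwnd, uint64_t prr_out)
                     : {
                     :         /* prior_cwnd must be non-zero, as the kernel asserts before this */
                     :         uint64_t dividend = ssthresh * prr_delivered + prior_cwnd - 1;
                     : 
                     :         return (int64_t)(dividend / prior_cwnd) - (int64_t)prr_out;
                     : }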
    2592             : 
    2593           0 : static inline void tcp_end_cwnd_reduction(struct sock *sk)
    2594             : {
    2595           0 :         struct tcp_sock *tp = tcp_sk(sk);
    2596             : 
    2597           0 :         if (inet_csk(sk)->icsk_ca_ops->cong_control)
    2598             :                 return;
    2599             : 
    2600             :         /* Reset cwnd to ssthresh in CWR or Recovery (unless it's undone) */
    2601           0 :         if (tp->snd_ssthresh < TCP_INFINITE_SSTHRESH &&
    2602           0 :             (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR || tp->undo_marker)) {
    2603           0 :                 tp->snd_cwnd = tp->snd_ssthresh;
    2604           0 :                 tp->snd_cwnd_stamp = tcp_jiffies32;
    2605             :         }
    2606           0 :         tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR);
    2607             : }
    2608             : 
    2609             : /* Enter CWR state. Disable cwnd undo since congestion is proven with ECN */
    2610           0 : void tcp_enter_cwr(struct sock *sk)
    2611             : {
    2612           0 :         struct tcp_sock *tp = tcp_sk(sk);
    2613             : 
    2614           0 :         tp->prior_ssthresh = 0;
    2615           0 :         if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) {
    2616           0 :                 tp->undo_marker = 0;
    2617           0 :                 tcp_init_cwnd_reduction(sk);
    2618           0 :                 tcp_set_ca_state(sk, TCP_CA_CWR);
    2619             :         }
    2620           0 : }
    2621             : EXPORT_SYMBOL(tcp_enter_cwr);
    2622             : 
    2623           0 : static void tcp_try_keep_open(struct sock *sk)
    2624             : {
    2625           0 :         struct tcp_sock *tp = tcp_sk(sk);
    2626           0 :         int state = TCP_CA_Open;
    2627             : 
    2628           0 :         if (tcp_left_out(tp) || tcp_any_retrans_done(sk))
    2629             :                 state = TCP_CA_Disorder;
    2630             : 
    2631           0 :         if (inet_csk(sk)->icsk_ca_state != state) {
    2632           0 :                 tcp_set_ca_state(sk, state);
    2633           0 :                 tp->high_seq = tp->snd_nxt;
    2634             :         }
    2635           0 : }
    2636             : 
    2637           0 : static void tcp_try_to_open(struct sock *sk, int flag)
    2638             : {
    2639           0 :         struct tcp_sock *tp = tcp_sk(sk);
    2640             : 
    2641           0 :         tcp_verify_left_out(tp);
    2642             : 
    2643           0 :         if (!tcp_any_retrans_done(sk))
    2644           0 :                 tp->retrans_stamp = 0;
    2645             : 
    2646           0 :         if (flag & FLAG_ECE)
    2647           0 :                 tcp_enter_cwr(sk);
    2648             : 
    2649           0 :         if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) {
    2650           0 :                 tcp_try_keep_open(sk);
    2651             :         }
    2652           0 : }
    2653             : 
    2654           0 : static void tcp_mtup_probe_failed(struct sock *sk)
    2655             : {
    2656           0 :         struct inet_connection_sock *icsk = inet_csk(sk);
    2657             : 
    2658           0 :         icsk->icsk_mtup.search_high = icsk->icsk_mtup.probe_size - 1;
    2659           0 :         icsk->icsk_mtup.probe_size = 0;
    2660           0 :         NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMTUPFAIL);
    2661             : }
    2662             : 
    2663           0 : static void tcp_mtup_probe_success(struct sock *sk)
    2664             : {
    2665           0 :         struct tcp_sock *tp = tcp_sk(sk);
    2666           0 :         struct inet_connection_sock *icsk = inet_csk(sk);
    2667             : 
    2668             :         /* FIXME: breaks with very large cwnd */
    2669           0 :         tp->prior_ssthresh = tcp_current_ssthresh(sk);
    2670           0 :         tp->snd_cwnd = tp->snd_cwnd *
    2671           0 :                        tcp_mss_to_mtu(sk, tp->mss_cache) /
    2672           0 :                        icsk->icsk_mtup.probe_size;
    2673           0 :         tp->snd_cwnd_cnt = 0;
    2674           0 :         tp->snd_cwnd_stamp = tcp_jiffies32;
    2675           0 :         tp->snd_ssthresh = tcp_current_ssthresh(sk);
    2676             : 
    2677           0 :         icsk->icsk_mtup.search_low = icsk->icsk_mtup.probe_size;
    2678           0 :         icsk->icsk_mtup.probe_size = 0;
    2679           0 :         tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
    2680           0 :         NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMTUPSUCCESS);
    2681           0 : }
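                     : 
                     : /* Editor's illustration of the cwnd rescaling above (standalone sketch):
                     :  * a successful probe enlarges the effective MTU, so fewer packets carry
                     :  * the same byte volume, e.g. cwnd = 10 at an effective MTU of 1500
                     :  * becomes 10 * 1500 / 3000 = 5 when a 3000-byte probe succeeds.
                     :  */
                     : #include <stdint.h>
                     : 
                     : static uint32_t rescale_cwnd(uint32_t snd_cwnd, uint32_t old_mtu,
                     :                              uint32_t probe_mtu)
                     : {
                     :         return snd_cwnd * old_mtu / probe_mtu;
                     : }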
    2682             : 
    2683             : /* Do a simple retransmit without using the backoff mechanisms in
    2684             :  * tcp_timer. This is used for path mtu discovery.
    2685             :  * The socket is already locked here.
    2686             :  */
    2687           0 : void tcp_simple_retransmit(struct sock *sk)
    2688             : {
    2689           0 :         const struct inet_connection_sock *icsk = inet_csk(sk);
    2690           0 :         struct tcp_sock *tp = tcp_sk(sk);
    2691           0 :         struct sk_buff *skb;
    2692           0 :         int mss;
    2693             : 
    2694             :         /* A fastopen SYN request is stored as two separate packets within
    2695             :          * the retransmit queue, this is done by tcp_send_syn_data().
    2696             :          * As a result simply checking the MSS of the frames in the queue
    2697             :          * will not work for the SYN packet.
    2698             :          *
    2699             :          * Being here is an indication of a path MTU issue, so we can
    2700             :          * assume that the fastopen SYN was lost and just mark all the
    2701             :          * frames in the retransmit queue as lost. We will use an MSS of
    2702             :          * -1 to mark all frames as lost, otherwise compute the current MSS.
    2703             :          * -1 to mark all frames as lost; otherwise we compute the current MSS.
    2704           0 :         if (tp->syn_data && sk->sk_state == TCP_SYN_SENT)
    2705             :                 mss = -1;
    2706             :         else
    2707           0 :                 mss = tcp_current_mss(sk);
    2708             : 
    2709           0 :         skb_rbtree_walk(skb, &sk->tcp_rtx_queue) {
    2710           0 :                 if (tcp_skb_seglen(skb) > mss)
    2711           0 :                         tcp_mark_skb_lost(sk, skb);
    2712             :         }
    2713             : 
    2714           0 :         tcp_clear_retrans_hints_partial(tp);
    2715             : 
    2716           0 :         if (!tp->lost_out)
    2717             :                 return;
    2718             : 
    2719           0 :         if (tcp_is_reno(tp))
    2720           0 :                 tcp_limit_reno_sacked(tp);
    2721             : 
    2722           0 :         tcp_verify_left_out(tp);
    2723             : 
    2724             :         /* Don't muck with the congestion window here.
    2725             :          * Reason is that we do not increase amount of _data_
    2726             :          * in network, but units changed and effective
    2727             :          * cwnd/ssthresh really reduced now.
    2728             :          */
    2729           0 :         if (icsk->icsk_ca_state != TCP_CA_Loss) {
    2730           0 :                 tp->high_seq = tp->snd_nxt;
    2731           0 :                 tp->snd_ssthresh = tcp_current_ssthresh(sk);
    2732           0 :                 tp->prior_ssthresh = 0;
    2733           0 :                 tp->undo_marker = 0;
    2734           0 :                 tcp_set_ca_state(sk, TCP_CA_Loss);
    2735             :         }
    2736           0 :         tcp_xmit_retransmit_queue(sk);
    2737             : }
    2738             : EXPORT_SYMBOL(tcp_simple_retransmit);
    2739             : 
    2740           0 : void tcp_enter_recovery(struct sock *sk, bool ece_ack)
    2741             : {
    2742           0 :         struct tcp_sock *tp = tcp_sk(sk);
    2743           0 :         int mib_idx;
    2744             : 
    2745           0 :         if (tcp_is_reno(tp))
    2746             :                 mib_idx = LINUX_MIB_TCPRENORECOVERY;
    2747             :         else
    2748           0 :                 mib_idx = LINUX_MIB_TCPSACKRECOVERY;
    2749             : 
    2750           0 :         NET_INC_STATS(sock_net(sk), mib_idx);
    2751             : 
    2752           0 :         tp->prior_ssthresh = 0;
    2753           0 :         tcp_init_undo(tp);
    2754             : 
    2755           0 :         if (!tcp_in_cwnd_reduction(sk)) {
    2756           0 :                 if (!ece_ack)
    2757           0 :                         tp->prior_ssthresh = tcp_current_ssthresh(sk);
    2758           0 :                 tcp_init_cwnd_reduction(sk);
    2759             :         }
    2760           0 :         tcp_set_ca_state(sk, TCP_CA_Recovery);
    2761           0 : }
    2762             : 
    2763             : /* Process an ACK in CA_Loss state. Move to CA_Open if lost data are
    2764             :  * recovered or spurious. Otherwise retransmits more on partial ACKs.
    2765             :  */
    2766           0 : static void tcp_process_loss(struct sock *sk, int flag, int num_dupack,
    2767             :                              int *rexmit)
    2768             : {
    2769           0 :         struct tcp_sock *tp = tcp_sk(sk);
    2770           0 :         bool recovered = !before(tp->snd_una, tp->high_seq);
    2771             : 
    2772           0 :         if ((flag & FLAG_SND_UNA_ADVANCED || rcu_access_pointer(tp->fastopen_rsk)) &&
    2773           0 :             tcp_try_undo_loss(sk, false))
    2774             :                 return;
    2775             : 
    2776           0 :         if (tp->frto) { /* F-RTO RFC5682 sec 3.1 (sack enhanced version). */
    2777             :                 /* Step 3.b. A timeout is spurious if not all data are
    2778             :                  * lost, i.e., never-retransmitted data are (s)acked.
    2779             :                  */
    2780           0 :                 if ((flag & FLAG_ORIG_SACK_ACKED) &&
    2781           0 :                     tcp_try_undo_loss(sk, true))
    2782             :                         return;
    2783             : 
    2784           0 :                 if (after(tp->snd_nxt, tp->high_seq)) {
    2785           0 :                         if (flag & FLAG_DATA_SACKED || num_dupack)
    2786           0 :                                 tp->frto = 0; /* Step 3.a. loss was real */
    2787           0 :                 } else if (flag & FLAG_SND_UNA_ADVANCED && !recovered) {
    2788           0 :                         tp->high_seq = tp->snd_nxt;
    2789             :                         /* Step 2.b. Try to send new data (but deferred until cwnd
    2790             :                          * is updated in tcp_ack()). Otherwise fall back to
    2791             :                          * the conventional recovery.
    2792             :                          */
    2793           0 :                         if (!tcp_write_queue_empty(sk) &&
    2794           0 :                             after(tcp_wnd_end(tp), tp->snd_nxt)) {
    2795           0 :                                 *rexmit = REXMIT_NEW;
    2796           0 :                                 return;
    2797             :                         }
    2798           0 :                         tp->frto = 0;
    2799             :                 }
    2800             :         }
    2801             : 
    2802           0 :         if (recovered) {
    2803             :                 /* F-RTO RFC5682 sec 3.1 step 2.a and 1st part of step 3.a */
    2804           0 :                 tcp_try_undo_recovery(sk);
    2805           0 :                 return;
    2806             :         }
    2807           0 :         if (tcp_is_reno(tp)) {
    2808             :                 /* A Reno DUPACK means new data in F-RTO step 2.b above are
    2809             :                  * delivered. Lower inflight to clock out (re)transmissions.
    2810             :                  */
    2811           0 :                 if (after(tp->snd_nxt, tp->high_seq) && num_dupack)
    2812           0 :                         tcp_add_reno_sack(sk, num_dupack, flag & FLAG_ECE);
    2813           0 :                 else if (flag & FLAG_SND_UNA_ADVANCED)
    2814           0 :                         tcp_reset_reno_sack(tp);
    2815             :         }
    2816           0 :         *rexmit = REXMIT_LOST;
    2817             : }
    2818             : 
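tcp_process_loss() compresses the F-RTO steps of RFC 5682 sec 3.1 into a few branches. Purely as a reading aid, here is a much-simplified sketch of the decision order; the boolean parameters are hypothetical stand-ins for the flag tests above, and it deliberately omits the undo bookkeeping and the Reno dupack handling:

        #include <stdbool.h>

        enum loss_action { UNDO_AND_OPEN, SEND_NEW_DATA, RETRANSMIT_LOST };

        /* Condensed reading of the branches above; not a drop-in for the kernel logic. */
        enum loss_action process_loss_sketch(bool frto, bool orig_sack_acked,
                                             bool sent_new_after_rto,
                                             bool new_data_sacked_or_dupack,
                                             bool snd_una_advanced, bool recovered,
                                             bool window_allows_new_data)
        {
                if (frto) {
                        /* Step 3.b: never-retransmitted data was (s)acked,
                         * so the timeout was spurious. */
                        if (orig_sack_acked)
                                return UNDO_AND_OPEN;
                        /* Step 3.a: after sending new data, a SACK or dupack
                         * shows the original loss was real. */
                        if (sent_new_after_rto && new_data_sacked_or_dupack)
                                return RETRANSMIT_LOST;
                        /* Step 2.b: on the first ACK advancing snd_una, prefer
                         * transmitting new data once if the window allows it. */
                        if (!sent_new_after_rto && snd_una_advanced && !recovered &&
                            window_allows_new_data)
                                return SEND_NEW_DATA;
                }
                if (recovered)          /* everything up to high_seq is ACKed */
                        return UNDO_AND_OPEN;
                return RETRANSMIT_LOST;
        }
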
    2819             : /* Undo during fast recovery after partial ACK. */
    2820           0 : static bool tcp_try_undo_partial(struct sock *sk, u32 prior_snd_una)
    2821             : {
    2822           0 :         struct tcp_sock *tp = tcp_sk(sk);
    2823             : 
    2824           0 :         if (tp->undo_marker && tcp_packet_delayed(tp)) {
    2825             :                 /* Plain luck! Hole is filled with a delayed
    2826             :                  * packet, rather than with a retransmit. Check reordering.
    2827             :                  */
    2828           0 :                 tcp_check_sack_reordering(sk, prior_snd_una, 1);
    2829             : 
    2830             :                 /* We are getting evidence that the reordering degree is higher
    2831             :                  * than we realized. If there are no retransmits out then we
    2832             :                  * can undo. Otherwise we clock out new packets but do not
    2833             :                  * mark more packets lost or retransmit more.
    2834             :                  */
    2835           0 :                 if (tp->retrans_out)
    2836             :                         return true;
    2837             : 
    2838           0 :                 if (!tcp_any_retrans_done(sk))
    2839           0 :                         tp->retrans_stamp = 0;
    2840             : 
    2841           0 :                 DBGUNDO(sk, "partial recovery");
    2842           0 :                 tcp_undo_cwnd_reduction(sk, true);
    2843           0 :                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPARTIALUNDO);
    2844           0 :                 tcp_try_keep_open(sk);
    2845           0 :                 return true;
    2846             :         }
    2847             :         return false;
    2848             : }
    2849             : 
    2850           0 : static void tcp_identify_packet_loss(struct sock *sk, int *ack_flag)
    2851             : {
    2852           0 :         struct tcp_sock *tp = tcp_sk(sk);
    2853             : 
    2854           0 :         if (tcp_rtx_queue_empty(sk))
    2855             :                 return;
    2856             : 
    2857           0 :         if (unlikely(tcp_is_reno(tp))) {
    2858           0 :                 tcp_newreno_mark_lost(sk, *ack_flag & FLAG_SND_UNA_ADVANCED);
    2859           0 :         } else if (tcp_is_rack(sk)) {
    2860           0 :                 u32 prior_retrans = tp->retrans_out;
    2861             : 
    2862           0 :                 if (tcp_rack_mark_lost(sk))
    2863           0 :                         *ack_flag &= ~FLAG_SET_XMIT_TIMER;
    2864           0 :                 if (prior_retrans > tp->retrans_out)
    2865           0 :                         *ack_flag |= FLAG_LOST_RETRANS;
    2866             :         }
    2867             : }
    2868             : 
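For SACK connections, tcp_rack_mark_lost() applies the RACK-TLP idea (RFC 8985): a segment is presumed lost once some segment sent after it has been delivered and a little more than an RTT plus a reordering window has elapsed since the segment's own transmission. A hedged, standalone sketch of that criterion (parameter names are illustrative, not the kernel's struct fields):

        #include <stdbool.h>
        #include <stdint.h>

        /* RACK loss test, simplified: lost if something sent later was already
         * delivered and more than rtt + reo_wnd has elapsed since this segment
         * was transmitted. */
        bool rack_deems_lost(uint64_t seg_tx_us, uint64_t newest_delivered_tx_us,
                             uint64_t now_us, uint64_t rack_rtt_us, uint64_t reo_wnd_us)
        {
                if (newest_delivered_tx_us <= seg_tx_us)        /* nothing newer delivered */
                        return false;
                return now_us - seg_tx_us > rack_rtt_us + reo_wnd_us;
        }
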
    2869           0 : static bool tcp_force_fast_retransmit(struct sock *sk)
    2870             : {
    2871           0 :         struct tcp_sock *tp = tcp_sk(sk);
    2872             : 
    2873           0 :         return after(tcp_highest_sack_seq(tp),
    2874             :                      tp->snd_una + tp->reordering * tp->mss_cache);
    2875             : }
    2876             : 
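tcp_force_fast_retransmit(), like most of this file, compares 32-bit sequence numbers with before()/after(), which remain correct across sequence-space wraparound because the test is done on the signed difference. A self-contained illustration (relying, as the kernel does, on two's-complement conversion):

        #include <assert.h>
        #include <stdbool.h>
        #include <stdint.h>

        /* seq1 is "before" seq2 if the signed 32-bit difference is negative. */
        static bool before(uint32_t seq1, uint32_t seq2)
        {
                return (int32_t)(seq1 - seq2) < 0;
        }
        #define after(seq2, seq1)       before(seq1, seq2)

        int main(void)
        {
                assert(before(1, 2));
                assert(!before(2, 2));
                assert(after(5u, 0xfffffff0u)); /* 5 is logically newer despite the wrap */
                return 0;
        }
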
    2877             : /* Process an event, which can update packets-in-flight not trivially.
    2878             :  * The main goal of this function is to calculate a new estimate for left_out,
    2879             :  * taking into account both packets sitting in the receiver's buffer and
    2880             :  * packets lost by the network.
    2881             :  *
    2882             :  * Besides that it updates the congestion state when packet loss or ECN
    2883             :  * is detected. But it does not reduce the cwnd; that is done by the
    2884             :  * congestion control later.
    2885             :  *
    2886             :  * It does _not_ decide what to send; that is done in
    2887             :  * tcp_xmit_retransmit_queue().
    2888             :  */
    2889           0 : static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
    2890             :                                   int num_dupack, int *ack_flag, int *rexmit)
    2891             : {
    2892           0 :         struct inet_connection_sock *icsk = inet_csk(sk);
    2893           0 :         struct tcp_sock *tp = tcp_sk(sk);
    2894           0 :         int fast_rexmit = 0, flag = *ack_flag;
    2895           0 :         bool ece_ack = flag & FLAG_ECE;
    2896           0 :         bool do_lost = num_dupack || ((flag & FLAG_DATA_SACKED) &&
    2897           0 :                                       tcp_force_fast_retransmit(sk));
    2898             : 
    2899           0 :         if (!tp->packets_out && tp->sacked_out)
    2900           0 :                 tp->sacked_out = 0;
    2901             : 
    2902             :         /* Now the state machine starts.
    2903             :          * A. ECE, hence prohibit cwnd undoing; the reduction is required. */
    2904           0 :         if (ece_ack)
    2905           0 :                 tp->prior_ssthresh = 0;
    2906             : 
    2907             :         /* B. In all the states check for reneging SACKs. */
    2908           0 :         if (tcp_check_sack_reneging(sk, flag))
    2909             :                 return;
    2910             : 
    2911             :         /* C. Check consistency of the current state. */
    2912           0 :         tcp_verify_left_out(tp);
    2913             : 
    2914             :         /* D. Check state exit conditions. State can be terminated
    2915             :          *    when high_seq is ACKed. */
    2916           0 :         if (icsk->icsk_ca_state == TCP_CA_Open) {
    2917           0 :                 WARN_ON(tp->retrans_out != 0);
    2918           0 :                 tp->retrans_stamp = 0;
    2919           0 :         } else if (!before(tp->snd_una, tp->high_seq)) {
    2920           0 :                 switch (icsk->icsk_ca_state) {
    2921           0 :                 case TCP_CA_CWR:
    2922             :                         /* CWR is to be held until something *above* high_seq
    2923             :                          * is ACKed, so the CWR bit reaches the receiver. */
    2924           0 :                         if (tp->snd_una != tp->high_seq) {
    2925           0 :                                 tcp_end_cwnd_reduction(sk);
    2926           0 :                                 tcp_set_ca_state(sk, TCP_CA_Open);
    2927             :                         }
    2928             :                         break;
    2929             : 
    2930             :                 case TCP_CA_Recovery:
    2931           0 :                         if (tcp_is_reno(tp))
    2932           0 :                                 tcp_reset_reno_sack(tp);
    2933           0 :                         if (tcp_try_undo_recovery(sk))
    2934             :                                 return;
    2935           0 :                         tcp_end_cwnd_reduction(sk);
    2936           0 :                         break;
    2937             :                 }
    2938           0 :         }
    2939             : 
    2940             :         /* E. Process state. */
    2941           0 :         switch (icsk->icsk_ca_state) {
    2942           0 :         case TCP_CA_Recovery:
    2943           0 :                 if (!(flag & FLAG_SND_UNA_ADVANCED)) {
    2944           0 :                         if (tcp_is_reno(tp))
    2945           0 :                                 tcp_add_reno_sack(sk, num_dupack, ece_ack);
    2946             :                 } else {
    2947           0 :                         if (tcp_try_undo_partial(sk, prior_snd_una))
    2948             :                                 return;
    2949             :                         /* Partial ACK arrived. Force fast retransmit. */
    2950           0 :                         do_lost = tcp_force_fast_retransmit(sk);
    2951             :                 }
    2952           0 :                 if (tcp_try_undo_dsack(sk)) {
    2953           0 :                         tcp_try_keep_open(sk);
    2954           0 :                         return;
    2955             :                 }
    2956           0 :                 tcp_identify_packet_loss(sk, ack_flag);
    2957           0 :                 break;
    2958           0 :         case TCP_CA_Loss:
    2959           0 :                 tcp_process_loss(sk, flag, num_dupack, rexmit);
    2960           0 :                 tcp_identify_packet_loss(sk, ack_flag);
    2961           0 :                 if (!(icsk->icsk_ca_state == TCP_CA_Open ||
    2962           0 :                       (*ack_flag & FLAG_LOST_RETRANS)))
    2963             :                         return;
    2964             :                 /* Change state if cwnd is undone or retransmits are lost */
    2965           0 :                 fallthrough;
    2966             :         default:
    2967           0 :                 if (tcp_is_reno(tp)) {
    2968           0 :                         if (flag & FLAG_SND_UNA_ADVANCED)
    2969           0 :                                 tcp_reset_reno_sack(tp);
    2970           0 :                         tcp_add_reno_sack(sk, num_dupack, ece_ack);
    2971             :                 }
    2972             : 
    2973           0 :                 if (icsk->icsk_ca_state <= TCP_CA_Disorder)
    2974           0 :                         tcp_try_undo_dsack(sk);
    2975             : 
    2976           0 :                 tcp_identify_packet_loss(sk, ack_flag);
    2977           0 :                 if (!tcp_time_to_recover(sk, flag)) {
    2978           0 :                         tcp_try_to_open(sk, flag);
    2979           0 :                         return;
    2980             :                 }
    2981             : 
    2982             :                 /* MTU probe failure: don't reduce cwnd */
    2983           0 :                 if (icsk->icsk_ca_state < TCP_CA_CWR &&
    2984           0 :                     icsk->icsk_mtup.probe_size &&
    2985           0 :                     tp->snd_una == tp->mtu_probe.probe_seq_start) {
    2986           0 :                         tcp_mtup_probe_failed(sk);
    2987             :                         /* Restores the reduction we did in tcp_mtup_probe() */
    2988           0 :                         tp->snd_cwnd++;
    2989           0 :                         tcp_simple_retransmit(sk);
    2990           0 :                         return;
    2991             :                 }
    2992             : 
    2993             :                 /* Otherwise enter Recovery state */
    2994           0 :                 tcp_enter_recovery(sk, ece_ack);
    2995           0 :                 fast_rexmit = 1;
    2996             :         }
    2997             : 
    2998           0 :         if (!tcp_is_rack(sk) && do_lost)
    2999           0 :                 tcp_update_scoreboard(sk, fast_rexmit);
    3000           0 :         *rexmit = REXMIT_LOST;
    3001             : }
    3002             : 
    3003         355 : static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us, const int flag)
    3004             : {
    3005         355 :         u32 wlen = sock_net(sk)->ipv4.sysctl_tcp_min_rtt_wlen * HZ;
    3006         355 :         struct tcp_sock *tp = tcp_sk(sk);
    3007             : 
    3008         355 :         if ((flag & FLAG_ACK_MAYBE_DELAYED) && rtt_us > tcp_min_rtt(tp)) {
    3009             :                 /* If the remote keeps returning delayed ACKs, eventually
    3010             :                  * the min filter would pick it up and overestimate the
    3011             :                  * prop. delay when it expires. Skip suspected delayed ACKs.
    3012             :                  */
    3013             :                 return;
    3014             :         }
    3015         257 :         minmax_running_min(&tp->rtt_min, wlen, tcp_jiffies32,
    3016           0 :                            rtt_us ? : jiffies_to_usecs(1));
    3017             : }
    3018             : 
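minmax_running_min() tracks the minimum RTT seen over the last sysctl_tcp_min_rtt_wlen seconds without keeping every sample; the kernel's windowed filter (lib/win_minmax.c) retains three time-stamped candidates. A deliberately cruder single-slot sketch of the same windowed-minimum idea:

        #include <stdint.h>

        struct min_rtt {
                uint32_t rtt_us;        /* current windowed minimum */
                uint32_t stamp;         /* time the minimum was recorded */
        };

        /* Take a new minimum, or start over once the stored one ages out of
         * the window (so a route change can raise the estimate again). */
        void min_rtt_update(struct min_rtt *m, uint32_t rtt_us,
                            uint32_t now, uint32_t win)
        {
                if (rtt_us <= m->rtt_us || now - m->stamp > win) {
                        m->rtt_us = rtt_us;
                        m->stamp = now;
                }
        }
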
    3019         355 : static bool tcp_ack_update_rtt(struct sock *sk, const int flag,
    3020             :                                long seq_rtt_us, long sack_rtt_us,
    3021             :                                long ca_rtt_us, struct rate_sample *rs)
    3022             : {
    3023         355 :         const struct tcp_sock *tp = tcp_sk(sk);
    3024             : 
    3025             :         /* Prefer RTT measured from ACK's timing to TS-ECR. This is because
    3026             :          * broken middle-boxes or peers may corrupt TS-ECR fields. But
    3027             :          * Karn's algorithm forbids taking RTT if some retransmitted data
    3028             :          * is acked (RFC6298).
    3029             :          */
    3030         355 :         if (seq_rtt_us < 0)
    3031           0 :                 seq_rtt_us = sack_rtt_us;
    3032             : 
    3033             :         /* RTTM Rule: A TSecr value received in a segment is used to
    3034             :          * update the averaged RTT measurement only if the segment
    3035             :          * acknowledges some new data, i.e., only if it advances the
    3036             :          * left edge of the send window.
    3037             :          * See draft-ietf-tcplw-high-performance-00, section 3.3.
    3038             :          */
    3039         355 :         if (seq_rtt_us < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
    3040           0 :             flag & FLAG_ACKED) {
    3041           0 :                 u32 delta = tcp_time_stamp(tp) - tp->rx_opt.rcv_tsecr;
    3042             : 
    3043           0 :                 if (likely(delta < INT_MAX / (USEC_PER_SEC / TCP_TS_HZ))) {
    3044           0 :                         if (!delta)
    3045             :                                 delta = 1;
    3046           0 :                         seq_rtt_us = delta * (USEC_PER_SEC / TCP_TS_HZ);
    3047           0 :                         ca_rtt_us = seq_rtt_us;
    3048             :                 }
    3049             :         }
    3050         355 :         rs->rtt_us = ca_rtt_us; /* RTT of last (S)ACKed packet (or -1) */
    3051         355 :         if (seq_rtt_us < 0)
    3052             :                 return false;
    3053             : 
    3054             :         /* ca_rtt_us >= 0 is counting on the invariant that ca_rtt_us is
    3055             :          * always taken together with ACK, SACK, or TS-opts. Any negative
    3056             :          * values will be skipped with the seq_rtt_us < 0 check above.
    3057             :          */
    3058         355 :         tcp_update_rtt_min(sk, ca_rtt_us, flag);
    3059         355 :         tcp_rtt_estimator(sk, seq_rtt_us);
    3060         355 :         tcp_set_rto(sk);
    3061             : 
    3062             :         /* RFC6298: only reset backoff on valid RTT measurement. */
    3063         355 :         inet_csk(sk)->icsk_backoff = 0;
    3064         355 :         return true;
    3065             : }
    3066             : 
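tcp_rtt_estimator() and tcp_set_rto(), called above, maintain the smoothed RTT, its mean deviation, and the retransmission timeout. The kernel keeps these in scaled fixed point (srtt is stored left-shifted by 3), but the recurrence is the classic RFC 6298 one; a plain floating-point sketch, using Linux's 200 ms lower clamp rather than the RFC's 1 s:

        #include <stdio.h>

        struct rtt_est {
                double srtt;    /* smoothed RTT, seconds */
                double rttvar;  /* mean deviation, seconds */
                double rto;     /* retransmission timeout, seconds */
                int init;
        };

        /* RFC 6298: alpha = 1/8, beta = 1/4, RTO = SRTT + 4 * RTTVAR. */
        void rtt_sample(struct rtt_est *e, double r)
        {
                if (!e->init) {
                        e->srtt = r;
                        e->rttvar = r / 2;
                        e->init = 1;
                } else {
                        double err = e->srtt - r;

                        e->rttvar = 0.75 * e->rttvar + 0.25 * (err < 0 ? -err : err);
                        e->srtt = 0.875 * e->srtt + 0.125 * r;
                }
                e->rto = e->srtt + 4 * e->rttvar;
                if (e->rto < 0.2)       /* TCP_RTO_MIN is 200 ms in Linux */
                        e->rto = 0.2;
        }

        int main(void)
        {
                struct rtt_est e = { 0 };
                double samples[4] = { 0.100, 0.120, 0.095, 0.300 };

                for (int i = 0; i < 4; i++) {
                        rtt_sample(&e, samples[i]);
                        printf("srtt=%.3f rttvar=%.3f rto=%.3f\n",
                               e.srtt, e.rttvar, e.rto);
                }
                return 0;
        }

Note how the sudden 300 ms outlier inflates rttvar, and hence the RTO, much faster than it moves srtt; that asymmetry is the point of tracking the deviation separately.
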
    3067             : /* Compute time elapsed between (last) SYNACK and the ACK completing 3WHS. */
    3068           4 : void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req)
    3069             : {
    3070           4 :         struct rate_sample rs;
    3071           4 :         long rtt_us = -1L;
    3072             : 
    3073           4 :         if (req && !req->num_retrans && tcp_rsk(req)->snt_synack)
    3074           4 :                 rtt_us = tcp_stamp_us_delta(tcp_clock_us(), tcp_rsk(req)->snt_synack);
    3075             : 
    3076           4 :         tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, rtt_us, -1L, rtt_us, &rs);
    3077           4 : }
    3078             : 
    3079             : 
    3080         351 : static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
    3081             : {
    3082         351 :         const struct inet_connection_sock *icsk = inet_csk(sk);
    3083             : 
    3084         351 :         icsk->icsk_ca_ops->cong_avoid(sk, ack, acked);
    3085         351 :         tcp_sk(sk)->snd_cwnd_stamp = tcp_jiffies32;
    3086         351 : }
    3087             : 
    3088             : /* Restart timer after forward progress on connection.
    3089             :  * RFC2988 recommends to restart timer to now+rto.
    3090             :  */
    3091         585 : void tcp_rearm_rto(struct sock *sk)
    3092             : {
    3093         585 :         const struct inet_connection_sock *icsk = inet_csk(sk);
    3094         585 :         struct tcp_sock *tp = tcp_sk(sk);
    3095             : 
    3096             :         /* If the retrans timer is currently being used by Fast Open
    3097             :          * for SYN-ACK retrans purpose, stay put.
    3098             :          */
    3099         585 :         if (rcu_access_pointer(tp->fastopen_rsk))
    3100             :                 return;
    3101             : 
    3102         585 :         if (!tp->packets_out) {
    3103         234 :                 inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS);
    3104             :         } else {
    3105         351 :                 u32 rto = inet_csk(sk)->icsk_rto;
    3106             :                 /* Offset the time elapsed after installing regular RTO */
    3107         351 :                 if (icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
    3108             :                     icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
    3109           0 :                         s64 delta_us = tcp_rto_delta_us(sk);
    3110             :                         /* delta_us may not be positive if the socket is locked
    3111             :                          * when the retrans timer fires and is rescheduled.
    3112             :                          */
    3113           0 :                         rto = usecs_to_jiffies(max_t(int, delta_us, 1));
    3114             :                 }
    3115         351 :                 tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto,
    3116             :                                      TCP_RTO_MAX);
    3117             :         }
    3118             : }
    3119             : 
    3120             : /* Try to schedule a loss probe; if that doesn't work, then schedule an RTO. */
    3121         351 : static void tcp_set_xmit_timer(struct sock *sk)
    3122             : {
    3123         351 :         if (!tcp_schedule_loss_probe(sk, true))
    3124         351 :                 tcp_rearm_rto(sk);
    3125         351 : }
    3126             : 
    3127             : /* If we get here, the whole TSO packet has not been acked. */
    3128           0 : static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb)
    3129             : {
    3130           0 :         struct tcp_sock *tp = tcp_sk(sk);
    3131           0 :         u32 packets_acked;
    3132             : 
    3133           0 :         BUG_ON(!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una));
    3134             : 
    3135           0 :         packets_acked = tcp_skb_pcount(skb);
    3136           0 :         if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
    3137             :                 return 0;
    3138           0 :         packets_acked -= tcp_skb_pcount(skb);
    3139             : 
    3140           0 :         if (packets_acked) {
    3141           0 :                 BUG_ON(tcp_skb_pcount(skb) == 0);
    3142           0 :                 BUG_ON(!before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq));
    3143             :         }
    3144             : 
    3145             :         return packets_acked;
    3146             : }
    3147             : 
    3148         481 : static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb,
    3149             :                            const struct sk_buff *ack_skb, u32 prior_snd_una)
    3150             : {
    3151         481 :         const struct skb_shared_info *shinfo;
    3152             : 
    3153             :         /* Avoid cache line misses to get skb_shinfo() and shinfo->tx_flags */
    3154         481 :         if (likely(!TCP_SKB_CB(skb)->txstamp_ack))
    3155             :                 return;
    3156             : 
    3157           0 :         shinfo = skb_shinfo(skb);
    3158           0 :         if (!before(shinfo->tskey, prior_snd_una) &&
    3159           0 :             before(shinfo->tskey, tcp_sk(sk)->snd_una)) {
    3160           0 :                 tcp_skb_tsorted_save(skb) {
    3161           0 :                         __skb_tstamp_tx(skb, ack_skb, NULL, sk, SCM_TSTAMP_ACK);
    3162           0 :                 } tcp_skb_tsorted_restore(skb);
    3163             :         }
    3164             : }
    3165             : 
    3166             : /* Remove acknowledged frames from the retransmission queue. If our packet
    3167             :  * is before the ack sequence we can discard it as it's confirmed to have
    3168             :  * arrived at the other end.
    3169             :  */
    3170         351 : static int tcp_clean_rtx_queue(struct sock *sk, const struct sk_buff *ack_skb,
    3171             :                                u32 prior_fack, u32 prior_snd_una,
    3172             :                                struct tcp_sacktag_state *sack, bool ece_ack)
    3173             : {
    3174         351 :         const struct inet_connection_sock *icsk = inet_csk(sk);
    3175         351 :         u64 first_ackt, last_ackt;
    3176         351 :         struct tcp_sock *tp = tcp_sk(sk);
    3177         351 :         u32 prior_sacked = tp->sacked_out;
    3178         351 :         u32 reord = tp->snd_nxt; /* lowest acked un-retx un-sacked seq */
    3179         351 :         struct sk_buff *skb, *next;
    3180         351 :         bool fully_acked = true;
    3181         351 :         long sack_rtt_us = -1L;
    3182         351 :         long seq_rtt_us = -1L;
    3183         351 :         long ca_rtt_us = -1L;
    3184         351 :         u32 pkts_acked = 0;
    3185         351 :         u32 last_in_flight = 0;
    3186         351 :         bool rtt_update;
    3187         351 :         int flag = 0;
    3188             : 
    3189         351 :         first_ackt = 0;
    3190             : 
    3191        1066 :         for (skb = skb_rb_first(&sk->tcp_rtx_queue); skb; skb = next) {
    3192         481 :                 struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
    3193         481 :                 const u32 start_seq = scb->seq;
    3194         481 :                 u8 sacked = scb->sacked;
    3195         481 :                 u32 acked_pcount;
    3196             : 
    3197             :                 /* Determine how many packets and what bytes were acked, tso and else */
    3198         481 :                 if (after(scb->end_seq, tp->snd_una)) {
    3199         117 :                         if (tcp_skb_pcount(skb) == 1 ||
    3200           0 :                             !after(tp->snd_una, scb->seq))
    3201             :                                 break;
    3202             : 
    3203           0 :                         acked_pcount = tcp_tso_acked(sk, skb);
    3204           0 :                         if (!acked_pcount)
    3205             :                                 break;
    3206             :                         fully_acked = false;
    3207             :                 } else {
    3208         364 :                         acked_pcount = tcp_skb_pcount(skb);
    3209             :                 }
    3210             : 
    3211         364 :                 if (unlikely(sacked & TCPCB_RETRANS)) {
    3212           0 :                         if (sacked & TCPCB_SACKED_RETRANS)
    3213           0 :                                 tp->retrans_out -= acked_pcount;
    3214           0 :                         flag |= FLAG_RETRANS_DATA_ACKED;
    3215         364 :                 } else if (!(sacked & TCPCB_SACKED_ACKED)) {
    3216         364 :                         last_ackt = tcp_skb_timestamp_us(skb);
    3217         364 :                         WARN_ON_ONCE(last_ackt == 0);
    3218         364 :                         if (!first_ackt)
    3219         351 :                                 first_ackt = last_ackt;
    3220             : 
    3221         364 :                         last_in_flight = TCP_SKB_CB(skb)->tx.in_flight;
    3222         364 :                         if (before(start_seq, reord))
    3223         351 :                                 reord = start_seq;
    3224         364 :                         if (!after(scb->end_seq, tp->high_seq))
    3225          12 :                                 flag |= FLAG_ORIG_SACK_ACKED;
    3226             :                 }
    3227             : 
    3228         364 :                 if (sacked & TCPCB_SACKED_ACKED) {
    3229           0 :                         tp->sacked_out -= acked_pcount;
    3230         364 :                 } else if (tcp_is_sack(tp)) {
    3231           0 :                         tcp_count_delivered(tp, acked_pcount, ece_ack);
    3232           0 :                         if (!tcp_skb_spurious_retrans(tp, skb))
    3233           0 :                                 tcp_rack_advance(tp, sacked, scb->end_seq,
    3234             :                                                  tcp_skb_timestamp_us(skb));
    3235             :                 }
    3236         364 :                 if (sacked & TCPCB_LOST)
    3237           0 :                         tp->lost_out -= acked_pcount;
    3238             : 
    3239         364 :                 tp->packets_out -= acked_pcount;
    3240         364 :                 pkts_acked += acked_pcount;
    3241         364 :                 tcp_rate_skb_delivered(sk, skb, sack->rate);
    3242             : 
    3243             :                 /* Initial outgoing SYNs get put onto the write_queue
    3244             :                  * just like anything else we transmit.  It is not
    3245             :                  * true data, and if we misinform our callers that
    3246             :                  * this ACK acks real data, we will erroneously exit
    3247             :                  * connection startup slow start one packet too
    3248             :                  * quickly.  This is severely frowned upon behavior.
    3249             :                  */
    3250         364 :                 if (likely(!(scb->tcp_flags & TCPHDR_SYN))) {
    3251         364 :                         flag |= FLAG_DATA_ACKED;
    3252             :                 } else {
    3253           0 :                         flag |= FLAG_SYN_ACKED;
    3254           0 :                         tp->retrans_stamp = 0;
    3255             :                 }
    3256             : 
    3257         364 :                 if (!fully_acked)
    3258             :                         break;
    3259             : 
    3260         364 :                 tcp_ack_tstamp(sk, skb, ack_skb, prior_snd_una);
    3261             : 
    3262         364 :                 next = skb_rb_next(skb);
    3263         364 :                 if (unlikely(skb == tp->retransmit_skb_hint))
    3264           0 :                         tp->retransmit_skb_hint = NULL;
    3265         364 :                 if (unlikely(skb == tp->lost_skb_hint))
    3266           0 :                         tp->lost_skb_hint = NULL;
    3267         364 :                 tcp_highest_sack_replace(sk, skb, next);
    3268         364 :                 tcp_rtx_queue_unlink_and_free(skb, sk);
    3269             :         }
    3270             : 
    3271         351 :         if (!skb)
    3272         234 :                 tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
    3273             : 
    3274         351 :         if (likely(between(tp->snd_up, prior_snd_una, tp->snd_una)))
    3275         351 :                 tp->snd_up = tp->snd_una;
    3276             : 
    3277         351 :         if (skb) {
    3278         117 :                 tcp_ack_tstamp(sk, skb, ack_skb, prior_snd_una);
    3279         117 :                 if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
    3280           0 :                         flag |= FLAG_SACK_RENEGING;
    3281             :         }
    3282             : 
    3283         351 :         if (likely(first_ackt) && !(flag & FLAG_RETRANS_DATA_ACKED)) {
    3284         351 :                 seq_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, first_ackt);
    3285         351 :                 ca_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, last_ackt);
    3286             : 
    3287         351 :                 if (pkts_acked == 1 && last_in_flight < tp->mss_cache &&
    3288         340 :                     last_in_flight && !prior_sacked && fully_acked &&
    3289         340 :                     sack->rate->prior_delivered + 1 == tp->delivered &&
    3290          99 :                     !(flag & (FLAG_CA_ALERT | FLAG_SYN_ACKED))) {
    3291             :                         /* Conservatively mark a delayed ACK. It's typically
    3292             :                          * from a lone runt packet over the round trip to
    3293             :                          * a receiver w/o out-of-order or CE events.
    3294             :                          */
    3295          99 :                         flag |= FLAG_ACK_MAYBE_DELAYED;
    3296             :                 }
    3297             :         }
    3298         351 :         if (sack->first_sackt) {
    3299           0 :                 sack_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, sack->first_sackt);
    3300           0 :                 ca_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, sack->last_sackt);
    3301             :         }
    3302         351 :         rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us,
    3303             :                                         ca_rtt_us, sack->rate);
    3304             : 
    3305         351 :         if (flag & FLAG_ACKED) {
    3306         351 :                 flag |= FLAG_SET_XMIT_TIMER;  /* set TLP or RTO timer */
    3307         351 :                 if (unlikely(icsk->icsk_mtup.probe_size &&
    3308             :                              !after(tp->mtu_probe.probe_seq_end, tp->snd_una))) {
    3309           0 :                         tcp_mtup_probe_success(sk);
    3310             :                 }
    3311             : 
    3312         351 :                 if (tcp_is_reno(tp)) {
    3313         351 :                         tcp_remove_reno_sacks(sk, pkts_acked, ece_ack);
    3314             : 
    3315             :                         /* If any of the cumulatively ACKed segments was
    3316             :                          * retransmitted, non-SACK case cannot confirm that
    3317             :                          * progress was due to original transmission due to
    3318             :                          * lack of TCPCB_SACKED_ACKED bits even if some of
    3319             :                          * the packets may have been never retransmitted.
    3320             :                          */
    3321         351 :                         if (flag & FLAG_RETRANS_DATA_ACKED)
    3322           0 :                                 flag &= ~FLAG_ORIG_SACK_ACKED;
    3323             :                 } else {
    3324           0 :                         int delta;
    3325             : 
    3326             :                         /* Non-retransmitted hole got filled? That's reordering */
    3327           0 :                         if (before(reord, prior_fack))
    3328           0 :                                 tcp_check_sack_reordering(sk, reord, 0);
    3329             : 
    3330           0 :                         delta = prior_sacked - tp->sacked_out;
    3331           0 :                         tp->lost_cnt_hint -= min(tp->lost_cnt_hint, delta);
    3332             :                 }
    3333           0 :         } else if (skb && rtt_update && sack_rtt_us >= 0 &&
    3334           0 :                    sack_rtt_us > tcp_stamp_us_delta(tp->tcp_mstamp,
    3335             :                                                     tcp_skb_timestamp_us(skb))) {
    3336             :                 /* Do not re-arm RTO if the sack RTT is measured from data sent
    3337             :                  * after the head was last (re)transmitted. Otherwise the
    3338             :                  * timeout may continue to extend in loss recovery.
    3339             :                  */
    3340           0 :                 flag |= FLAG_SET_XMIT_TIMER;  /* set TLP or RTO timer */
    3341             :         }
    3342             : 
    3343         351 :         if (icsk->icsk_ca_ops->pkts_acked) {
    3344         351 :                 struct ack_sample sample = { .pkts_acked = pkts_acked,
    3345         351 :                                              .rtt_us = sack->rate->rtt_us,
    3346             :                                              .in_flight = last_in_flight };
    3347             : 
    3348         351 :                 icsk->icsk_ca_ops->pkts_acked(sk, &sample);
    3349             :         }
    3350             : 
    3351             : #if FASTRETRANS_DEBUG > 0
    3352         351 :         WARN_ON((int)tp->sacked_out < 0);
    3353         351 :         WARN_ON((int)tp->lost_out < 0);
    3354         351 :         WARN_ON((int)tp->retrans_out < 0);
    3355         351 :         if (!tp->packets_out && tcp_is_sack(tp)) {
    3356           0 :                 icsk = inet_csk(sk);
    3357           0 :                 if (tp->lost_out) {
    3358           0 :                         pr_debug("Leak l=%u %d\n",
    3359             :                                  tp->lost_out, icsk->icsk_ca_state);
    3360           0 :                         tp->lost_out = 0;
    3361             :                 }
    3362           0 :                 if (tp->sacked_out) {
    3363           0 :                         pr_debug("Leak s=%u %d\n",
    3364             :                                  tp->sacked_out, icsk->icsk_ca_state);
    3365           0 :                         tp->sacked_out = 0;
    3366             :                 }
    3367           0 :                 if (tp->retrans_out) {
    3368           0 :                         pr_debug("Leak r=%u %d\n",
    3369             :                                  tp->retrans_out, icsk->icsk_ca_state);
    3370           0 :                         tp->retrans_out = 0;
    3371             :                 }
    3372             :         }
    3373             : #endif
    3374         351 :         return flag;
    3375             : }
    3376             : 
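Stripped of SACK scoring, timestamps, MIB counters and congestion hooks, the heart of tcp_clean_rtx_queue() is: walk the retransmit queue in sequence order, drop every segment whose end_seq no longer exceeds the new snd_una, and take one RTT sample provided no retransmitted data was cumulatively ACKed (Karn's rule). A compact user-space sketch over a sorted array (hypothetical struct, not the kernel's skb/rb-tree machinery):

        #include <stdbool.h>
        #include <stdint.h>

        struct seg {
                uint32_t seq, end_seq;
                uint64_t tx_us;         /* transmit timestamp, microseconds */
                bool retrans;           /* was this segment ever retransmitted? */
        };

        /* Returns how many leading segments of q[] are fully acked by snd_una and,
         * via *rtt_us, now minus the oldest clean transmit time (0 if Karn's rule
         * forbids a sample). The caller then drops q[0..acked-1]. */
        int clean_rtx_queue_sketch(const struct seg *q, int n, uint32_t snd_una,
                                   uint64_t now_us, uint64_t *rtt_us)
        {
                bool retrans_acked = false;
                uint64_t first_tx = 0;
                int acked = 0;

                for (int i = 0; i < n; i++) {
                        if ((int32_t)(q[i].end_seq - snd_una) > 0)
                                break;                  /* not fully acked yet */
                        if (q[i].retrans)
                                retrans_acked = true;
                        else if (!first_tx)
                                first_tx = q[i].tx_us;
                        acked++;
                }
                *rtt_us = (first_tx && !retrans_acked) ? now_us - first_tx : 0;
                return acked;
        }
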
    3377          19 : static void tcp_ack_probe(struct sock *sk)
    3378             : {
    3379          19 :         struct inet_connection_sock *icsk = inet_csk(sk);
    3380          19 :         struct sk_buff *head = tcp_send_head(sk);
    3381           0 :         const struct tcp_sock *tp = tcp_sk(sk);
    3382             : 
    3383             :         /* Has a usable window opened? */
    3384           0 :         if (!head)
    3385             :                 return;
    3386           0 :         if (!after(TCP_SKB_CB(head)->end_seq, tcp_wnd_end(tp))) {
    3387           0 :                 icsk->icsk_backoff = 0;
    3388           0 :                 icsk->icsk_probes_tstamp = 0;
    3389           0 :                 inet_csk_clear_xmit_timer(sk, ICSK_TIME_PROBE0);
    3390             :                 /* Socket must be woken up by a subsequent tcp_data_snd_check().
    3391             :                  * This function is not for arbitrary use!
    3392             :                  */
    3393             :         } else {
    3394           0 :                 unsigned long when = tcp_probe0_when(sk, TCP_RTO_MAX);
    3395             : 
    3396           0 :                 when = tcp_clamp_probe0_to_user_timeout(sk, when);
    3397           0 :                 tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, when, TCP_RTO_MAX);
    3398             :         }
    3399             : }
    3400             : 
    3401         351 : static inline bool tcp_ack_is_dubious(const struct sock *sk, const int flag)
    3402             : {
    3403         351 :         return !(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) ||
    3404         351 :                 inet_csk(sk)->icsk_ca_state != TCP_CA_Open;
    3405             : }
    3406             : 
    3407             : /* Decide whether to run the increase function of congestion control. */
    3408         351 : static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag)
    3409             : {
    3410             :         /* If reordering is high then always grow cwnd whenever data is
    3411             :          * delivered regardless of its ordering. Otherwise stay conservative
    3412             :          * and only grow cwnd on in-order delivery (RFC5681). A stretched ACK w/
    3413             :          * new SACK or ECE mark may first advance cwnd here and later reduce
    3414             :          * cwnd in tcp_fastretrans_alert() based on more states.
    3415             :          */
    3416         351 :         if (tcp_sk(sk)->reordering > sock_net(sk)->ipv4.sysctl_tcp_reordering)
    3417           0 :                 return flag & FLAG_FORWARD_PROGRESS;
    3418             : 
    3419         351 :         return flag & FLAG_DATA_ACKED;
    3420             : }
    3421             : 
    3422             : /* The "ultimate" congestion control function that aims to replace the rigid
    3423             :  * cwnd increase and decrease control (tcp_cong_avoid, tcp_*cwnd_reduction).
    3424             :  * It's called toward the end of processing an ACK with precise rate
    3425             :  * information. All transmissions or retransmissions are delayed until afterwards.
    3426             :  */
    3427         351 : static void tcp_cong_control(struct sock *sk, u32 ack, u32 acked_sacked,
    3428             :                              int flag, const struct rate_sample *rs)
    3429             : {
    3430         351 :         const struct inet_connection_sock *icsk = inet_csk(sk);
    3431             : 
    3432         351 :         if (icsk->icsk_ca_ops->cong_control) {
    3433           0 :                 icsk->icsk_ca_ops->cong_control(sk, rs);
    3434           0 :                 return;
    3435             :         }
    3436             : 
    3437         351 :         if (tcp_in_cwnd_reduction(sk)) {
    3438             :                 /* Reduce cwnd if state mandates */
    3439           0 :                 tcp_cwnd_reduction(sk, acked_sacked, rs->losses, flag);
    3440         702 :         } else if (tcp_may_raise_cwnd(sk, flag)) {
    3441             :                 /* Advance cwnd if state allows */
    3442         351 :                 tcp_cong_avoid(sk, ack, acked_sacked);
    3443             :         }
    3444         351 :         tcp_update_pacing_rate(sk);
    3445             : }
    3446             : 
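When the congestion-control module has no cong_control hook, the ACK either feeds tcp_cwnd_reduction() or, via tcp_cong_avoid(), the module's cong_avoid() growth path. For the built-in Reno/NewReno module that means slow start below ssthresh and roughly one extra segment per RTT above it; a textbook-style sketch (not the exact tcp_reno_cong_avoid()/tcp_cong_avoid_ai() code, which also caps slow-start growth at ssthresh):

        #include <stdint.h>

        struct cc {
                uint32_t cwnd;          /* congestion window, in segments */
                uint32_t ssthresh;      /* slow-start threshold, in segments */
                uint32_t cwnd_cnt;      /* fractional-growth accumulator */
        };

        /* Grow cwnd for 'acked' newly acknowledged segments. */
        void reno_cong_avoid_sketch(struct cc *c, uint32_t acked)
        {
                if (c->cwnd < c->ssthresh) {    /* slow start: +1 per acked segment */
                        c->cwnd += acked;
                        return;
                }
                c->cwnd_cnt += acked;           /* congestion avoidance: ~+1 per RTT */
                if (c->cwnd_cnt >= c->cwnd) {
                        c->cwnd_cnt -= c->cwnd;
                        c->cwnd++;
                }
        }

The integer accumulator is the usual way to get the "cwnd += 1/cwnd per ACK" behaviour without fractions: only after roughly cwnd acknowledged segments does the window grow by one.
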
    3447             : /* Check that window update is acceptable.
    3448             :  * The function assumes that snd_una <= ack <= snd_nxt.
    3449             :  */
    3450          22 : static inline bool tcp_may_update_window(const struct tcp_sock *tp,
    3451             :                                         const u32 ack, const u32 ack_seq,
    3452             :                                         const u32 nwin)
    3453             : {
    3454          19 :         return  after(ack, tp->snd_una) ||
    3455          22 :                 after(ack_seq, tp->snd_wl1) ||
    3456           5 :                 (ack_seq == tp->snd_wl1 && nwin > tp->snd_wnd);
    3457             : }
    3458             : 
    3459             : /* If we update tp->snd_una, also update tp->bytes_acked */
    3460         370 : static void tcp_snd_una_update(struct tcp_sock *tp, u32 ack)
    3461             : {
    3462         370 :         u32 delta = ack - tp->snd_una;
    3463             : 
    3464         370 :         sock_owned_by_me((struct sock *)tp);
    3465         370 :         tp->bytes_acked += delta;
    3466         370 :         tp->snd_una = ack;
    3467             : }
    3468             : 
    3469             : /* If we update tp->rcv_nxt, also update tp->bytes_received */
    3470          70 : static void tcp_rcv_nxt_update(struct tcp_sock *tp, u32 seq)
    3471             : {
    3472          70 :         u32 delta = seq - tp->rcv_nxt;
    3473             : 
    3474          70 :         sock_owned_by_me((struct sock *)tp);
    3475          70 :         tp->bytes_received += delta;
    3476          70 :         WRITE_ONCE(tp->rcv_nxt, seq);
    3477             : }
    3478             : 
    3479             : /* Update our send window.
    3480             :  *
    3481             :  * The window update algorithm described in RFC793/RFC1122 (and used in
    3482             :  * linux-2.2 and in FreeBSD; NetBSD's is even worse) is wrong.
    3483             :  */
    3484          22 : static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32 ack,
    3485             :                                  u32 ack_seq)
    3486             : {
    3487          22 :         struct tcp_sock *tp = tcp_sk(sk);
    3488          22 :         int flag = 0;
    3489          22 :         u32 nwin = ntohs(tcp_hdr(skb)->window);
    3490             : 
    3491          22 :         if (likely(!tcp_hdr(skb)->syn))
    3492          22 :                 nwin <<= tp->rx_opt.snd_wscale;
    3493             : 
    3494          41 :         if (tcp_may_update_window(tp, ack, ack_seq, nwin)) {
    3495          17 :                 flag |= FLAG_WIN_UPDATE;
    3496          17 :                 tcp_update_wl(tp, ack_seq);
    3497             : 
    3498          17 :                 if (tp->snd_wnd != nwin) {
    3499           0 :                         tp->snd_wnd = nwin;
    3500             : 
    3501             :                         /* Note, it is the only place, where
    3502             :                          * fast path is recovered for sending TCP.
    3503             :                          */
    3504           0 :                         tp->pred_flags = 0;
    3505           0 :                         tcp_fast_path_check(sk);
    3506             : 
    3507           0 :                         if (!tcp_write_queue_empty(sk))
    3508           0 :                                 tcp_slow_start_after_idle_check(sk);
    3509             : 
    3510           0 :                         if (nwin > tp->max_window) {
    3511           0 :                                 tp->max_window = nwin;
    3512           0 :                                 tcp_sync_mss(sk, inet_csk(sk)->icsk_pmtu_cookie);
    3513             :                         }
    3514             :                 }
    3515             :         }
    3516             : 
    3517          22 :         tcp_snd_una_update(tp, ack);
    3518             : 
    3519          22 :         return flag;
    3520             : }
    3521             : 
    3522           0 : static bool __tcp_oow_rate_limited(struct net *net, int mib_idx,
    3523             :                                    u32 *last_oow_ack_time)
    3524             : {
    3525           0 :         if (*last_oow_ack_time) {
    3526           0 :                 s32 elapsed = (s32)(tcp_jiffies32 - *last_oow_ack_time);
    3527             : 
    3528           0 :                 if (0 <= elapsed && elapsed < net->ipv4.sysctl_tcp_invalid_ratelimit) {
    3529           0 :                         NET_INC_STATS(net, mib_idx);
    3530           0 :                         return true;    /* rate-limited: don't send yet! */
    3531             :                 }
    3532             :         }
    3533             : 
    3534           0 :         *last_oow_ack_time = tcp_jiffies32;
    3535             : 
    3536           0 :         return false;   /* not rate-limited: go ahead, send dupack now! */
    3537             : }
    3538             : 
    3539             : /* Return true if we're currently rate-limiting out-of-window ACKs and
    3540             :  * thus shouldn't send a dupack right now. We rate-limit dupacks in
    3541             :  * response to out-of-window SYNs or ACKs to mitigate ACK loops or DoS
    3542             :  * attacks that send repeated SYNs or ACKs for the same connection. To
    3543             :  * do this, we do not send a duplicate SYNACK or ACK if the remote
    3544             :  * endpoint is sending out-of-window SYNs or pure ACKs at a high rate.
    3545             :  */
    3546           0 : bool tcp_oow_rate_limited(struct net *net, const struct sk_buff *skb,
    3547             :                           int mib_idx, u32 *last_oow_ack_time)
    3548             : {
    3549             :         /* Data packets without SYNs are not likely part of an ACK loop. */
    3550           0 :         if ((TCP_SKB_CB(skb)->seq != TCP_SKB_CB(skb)->end_seq) &&
    3551           0 :             !tcp_hdr(skb)->syn)
    3552             :                 return false;
    3553             : 
    3554           0 :         return __tcp_oow_rate_limited(net, mib_idx, last_oow_ack_time);
    3555             : }
    3556             : 
    3557             : /* RFC 5961 7 [ACK Throttling] */
    3558           0 : static void tcp_send_challenge_ack(struct sock *sk, const struct sk_buff *skb)
    3559             : {
    3560             :         /* unprotected vars, we don't care about overwrites */
    3561           0 :         static u32 challenge_timestamp;
    3562           0 :         static unsigned int challenge_count;
    3563           0 :         struct tcp_sock *tp = tcp_sk(sk);
    3564           0 :         struct net *net = sock_net(sk);
    3565           0 :         u32 count, now;
    3566             : 
    3567             :         /* First check our per-socket dupack rate limit. */
    3568           0 :         if (__tcp_oow_rate_limited(net,
    3569             :                                    LINUX_MIB_TCPACKSKIPPEDCHALLENGE,
    3570             :                                    &tp->last_oow_ack_time))
    3571             :                 return;
    3572             : 
    3573             :         /* Then check host-wide RFC 5961 rate limit. */
    3574           0 :         now = jiffies / HZ;
    3575           0 :         if (now != challenge_timestamp) {
    3576           0 :                 u32 ack_limit = net->ipv4.sysctl_tcp_challenge_ack_limit;
    3577           0 :                 u32 half = (ack_limit + 1) >> 1;
    3578             : 
    3579           0 :                 challenge_timestamp = now;
    3580           0 :                 WRITE_ONCE(challenge_count, half + prandom_u32_max(ack_limit));
    3581             :         }
    3582           0 :         count = READ_ONCE(challenge_count);
    3583           0 :         if (count > 0) {
    3584           0 :                 WRITE_ONCE(challenge_count, count - 1);
    3585           0 :                 NET_INC_STATS(net, LINUX_MIB_TCPCHALLENGEACK);
    3586           0 :                 tcp_send_ack(sk);
    3587             :         }
    3588             : }
    3589             : 
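The host-wide limit above implements the RFC 5961 ACK-throttling recommendation, with the per-second budget randomized (half the sysctl value plus a random extra) so that the exact number of challenge ACKs sent cannot be used as a side channel against the connection. A user-space sketch of that bucket, with rand() standing in for prandom_u32_max():

        #include <stdint.h>
        #include <stdlib.h>

        static uint32_t challenge_sec;          /* second the current budget was issued */
        static uint32_t challenge_budget;       /* challenge ACKs left in that second */

        /* Return 1 if a challenge ACK may be sent now, 0 if the budget is spent. */
        int challenge_ack_allowed(uint32_t now_sec, uint32_t sysctl_limit)
        {
                if (now_sec != challenge_sec) {
                        uint32_t half = (sysctl_limit + 1) / 2;

                        challenge_sec = now_sec;
                        challenge_budget = half +
                                (sysctl_limit ? (uint32_t)rand() % sysctl_limit : 0);
                }
                if (!challenge_budget)
                        return 0;
                challenge_budget--;
                return 1;
        }
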
    3590           0 : static void tcp_store_ts_recent(struct tcp_sock *tp)
    3591             : {
    3592           0 :         tp->rx_opt.ts_recent = tp->rx_opt.rcv_tsval;
    3593           0 :         tp->rx_opt.ts_recent_stamp = ktime_get_seconds();
    3594           0 : }
    3595             : 
    3596          22 : static void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq)
    3597             : {
    3598          22 :         if (tp->rx_opt.saw_tstamp && !after(seq, tp->rcv_wup)) {
    3599             :                 /* PAWS bug workaround wrt. ACK frames: the PAWS discard
    3600             :                  * extra check below makes sure this can only happen
    3601             :                  * for pure ACK frames.  -DaveM
    3602             :                  *
    3603             :                  * Not only that, it also occurs for expired timestamps.
    3604             :                  */
    3605             : 
    3606           0 :                 if (tcp_paws_check(&tp->rx_opt, 0))
    3607           0 :                         tcp_store_ts_recent(tp);
    3608             :         }
    3609          22 : }
    3610             : 
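tcp_paws_check(), used just above, is the PAWS test from RFC 7323: a segment whose TSval is older (in 32-bit modular terms) than the most recently recorded ts_recent is rejected, unless ts_recent itself is stale, i.e. more than 24 days old, in which case the peer's timestamp clock may have wrapped and the stored value can no longer be trusted. The essential check, as a standalone sketch with a zero tolerance window:

        #include <stdbool.h>
        #include <stdint.h>
        #include <time.h>

        #define TCP_PAWS_24DAYS (24 * 24 * 60 * 60)     /* seconds, as in the kernel */

        /* Return true if PAWS says the segment must be dropped. */
        bool paws_reject_sketch(uint32_t ts_recent, time_t ts_recent_stamp,
                                uint32_t seg_tsval, time_t now)
        {
                if ((int32_t)(seg_tsval - ts_recent) >= 0)      /* TSval not older: OK */
                        return false;
                if (now - ts_recent_stamp > TCP_PAWS_24DAYS)    /* ts_recent too stale */
                        return false;
                return true;
        }
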
    3611             : /* This routine deals with acks during a TLP episode and ends an episode by
    3612             :  * resetting tlp_high_seq. Ref: TLP algorithm in draft-ietf-tcpm-rack
    3613             :  */
    3614           0 : static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag)
    3615             : {
    3616           0 :         struct tcp_sock *tp = tcp_sk(sk);
    3617             : 
    3618           0 :         if (before(ack, tp->tlp_high_seq))
    3619             :                 return;
    3620             : 
    3621           0 :         if (!tp->tlp_retrans) {
    3622             :                 /* TLP of new data has been acknowledged */
    3623           0 :                 tp->tlp_high_seq = 0;
    3624           0 :         } else if (flag & FLAG_DSACKING_ACK) {
    3625             :                 /* This DSACK means original and TLP probe arrived; no loss */
    3626           0 :                 tp->tlp_high_seq = 0;
    3627           0 :         } else if (after(ack, tp->tlp_high_seq)) {
    3628             :                 /* ACK advances: there was a loss, so reduce cwnd. Reset
    3629             :                  * tlp_high_seq in tcp_init_cwnd_reduction()
    3630             :                  */
    3631           0 :                 tcp_init_cwnd_reduction(sk);
    3632           0 :                 tcp_set_ca_state(sk, TCP_CA_CWR);
    3633           0 :                 tcp_end_cwnd_reduction(sk);
    3634           0 :                 tcp_try_keep_open(sk);
    3635           0 :                 NET_INC_STATS(sock_net(sk),
    3636             :                                 LINUX_MIB_TCPLOSSPROBERECOVERY);
    3637           0 :         } else if (!(flag & (FLAG_SND_UNA_ADVANCED |
    3638             :                              FLAG_NOT_DUP | FLAG_DATA_SACKED))) {
    3639             :                 /* Pure dupack: original and TLP probe arrived; no loss */
    3640           0 :                 tp->tlp_high_seq = 0;
    3641             :         }
    3642             : }
    3643             : 
    3644         370 : static inline void tcp_in_ack_event(struct sock *sk, u32 flags)
    3645             : {
    3646         370 :         const struct inet_connection_sock *icsk = inet_csk(sk);
    3647             : 
    3648         370 :         if (icsk->icsk_ca_ops->in_ack_event)
    3649           0 :                 icsk->icsk_ca_ops->in_ack_event(sk, flags);
    3650             : }
    3651             : 
    3652             : /* Congestion control has updated the cwnd already. So if we're in
    3653             :  * loss recovery then now we do any new sends (for FRTO) or
    3654             :  * retransmits (for CA_Loss or CA_recovery) that make sense.
    3655             :  */
    3656         351 : static void tcp_xmit_recovery(struct sock *sk, int rexmit)
    3657             : {
    3658         351 :         struct tcp_sock *tp = tcp_sk(sk);
    3659             : 
    3660         351 :         if (rexmit == REXMIT_NONE || sk->sk_state == TCP_SYN_SENT)
    3661             :                 return;
    3662             : 
    3663           0 :         if (unlikely(rexmit == REXMIT_NEW)) {
    3664           0 :                 __tcp_push_pending_frames(sk, tcp_current_mss(sk),
    3665             :                                           TCP_NAGLE_OFF);
    3666           0 :                 if (after(tp->snd_nxt, tp->high_seq))
    3667             :                         return;
    3668           0 :                 tp->frto = 0;
    3669             :         }
    3670           0 :         tcp_xmit_retransmit_queue(sk);
    3671             : }
    3672             : 
    3673             : /* Returns the number of packets newly acked or sacked by the current ACK */
    3674         351 : static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered, int flag)
    3675             : {
    3676         351 :         const struct net *net = sock_net(sk);
    3677         351 :         struct tcp_sock *tp = tcp_sk(sk);
    3678         351 :         u32 delivered;
    3679             : 
    3680         351 :         delivered = tp->delivered - prior_delivered;
    3681         351 :         NET_ADD_STATS(net, LINUX_MIB_TCPDELIVERED, delivered);
    3682         351 :         if (flag & FLAG_ECE)
    3683         351 :                 NET_ADD_STATS(net, LINUX_MIB_TCPDELIVEREDCE, delivered);
    3684             : 
    3685         351 :         return delivered;
    3686             : }
    3687             : 
    3688             : /* This routine deals with incoming acks, but not outgoing ones. */
    3689         370 : static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
    3690             : {
    3691         370 :         struct inet_connection_sock *icsk = inet_csk(sk);
    3692         370 :         struct tcp_sock *tp = tcp_sk(sk);
    3693         370 :         struct tcp_sacktag_state sack_state;
    3694         370 :         struct rate_sample rs = { .prior_delivered = 0 };
    3695         370 :         u32 prior_snd_una = tp->snd_una;
    3696         370 :         bool is_sack_reneg = tp->is_sack_reneg;
    3697         370 :         u32 ack_seq = TCP_SKB_CB(skb)->seq;
    3698         370 :         u32 ack = TCP_SKB_CB(skb)->ack_seq;
    3699         370 :         int num_dupack = 0;
    3700         370 :         int prior_packets = tp->packets_out;
    3701         370 :         u32 delivered = tp->delivered;
    3702         370 :         u32 lost = tp->lost;
    3703         370 :         int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */
    3704         370 :         u32 prior_fack;
    3705             : 
    3706         370 :         sack_state.first_sackt = 0;
    3707         370 :         sack_state.rate = &rs;
    3708         370 :         sack_state.sack_delivered = 0;
    3709             : 
    3710             :         /* We very likely will need to access rtx queue. */
    3711         370 :         prefetch(sk->tcp_rtx_queue.rb_node);
    3712             : 
    3713             :         /* If the ack is older than previous acks
    3714             :          * then we can probably ignore it.
    3715             :          */
    3716         370 :         if (before(ack, prior_snd_una)) {
    3717             :                 /* RFC 5961 5.2 [Blind Data Injection Attack].[Mitigation] */
    3718           0 :                 if (before(ack, prior_snd_una - tp->max_window)) {
    3719           0 :                         if (!(flag & FLAG_NO_CHALLENGE_ACK))
    3720           0 :                                 tcp_send_challenge_ack(sk, skb);
    3721           0 :                         return -1;
    3722             :                 }
    3723           0 :                 goto old_ack;
    3724             :         }
    3725             : 
    3726             :         /* If the ack includes data we haven't sent yet, discard
    3727             :          * this segment (RFC793 Section 3.9).
    3728             :          */
    3729         370 :         if (after(ack, tp->snd_nxt))
    3730             :                 return -1;
    3731             : 
    3732         370 :         if (after(ack, prior_snd_una)) {
    3733         351 :                 flag |= FLAG_SND_UNA_ADVANCED;
    3734         351 :                 icsk->icsk_retransmits = 0;
    3735             : 
    3736             : #if IS_ENABLED(CONFIG_TLS_DEVICE)
    3737             :                 if (static_branch_unlikely(&clean_acked_data_enabled.key))
    3738             :                         if (icsk->icsk_clean_acked)
    3739             :                                 icsk->icsk_clean_acked(sk, ack);
    3740             : #endif
    3741             :         }
    3742             : 
    3743         370 :         prior_fack = tcp_is_sack(tp) ? tcp_highest_sack_seq(tp) : tp->snd_una;
    3744         370 :         rs.prior_in_flight = tcp_packets_in_flight(tp);
    3745             : 
    3746             :         /* ts_recent update must be made after we are sure that the packet
    3747             :          * is in window.
    3748             :          */
    3749         370 :         if (flag & FLAG_UPDATE_TS_RECENT)
    3750          22 :                 tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq);
    3751             : 
    3752         370 :         if ((flag & (FLAG_SLOWPATH | FLAG_SND_UNA_ADVANCED)) ==
    3753             :             FLAG_SND_UNA_ADVANCED) {
    3754             :                 /* Window is constant, pure forward advance.
    3755             :                  * No more checks are required.
    3756             :                  * Note, we use the fact that SND.UNA>=SND.WL2.
    3757             :                  */
    3758         348 :                 tcp_update_wl(tp, ack_seq);
    3759         348 :                 tcp_snd_una_update(tp, ack);
    3760         348 :                 flag |= FLAG_WIN_UPDATE;
    3761             : 
    3762         348 :                 tcp_in_ack_event(sk, CA_ACK_WIN_UPDATE);
    3763             : 
    3764         348 :                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPACKS);
    3765             :         } else {
    3766          22 :                 u32 ack_ev_flags = CA_ACK_SLOWPATH;
    3767             : 
    3768          22 :                 if (ack_seq != TCP_SKB_CB(skb)->end_seq)
    3769          15 :                         flag |= FLAG_DATA;
    3770             :                 else
    3771           7 :                         NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPUREACKS);
    3772             : 
    3773          22 :                 flag |= tcp_ack_update_window(sk, skb, ack, ack_seq);
    3774             : 
    3775          22 :                 if (TCP_SKB_CB(skb)->sacked)
    3776           0 :                         flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
    3777             :                                                         &sack_state);
    3778             : 
    3779          22 :                 if (tcp_ecn_rcv_ecn_echo(tp, tcp_hdr(skb))) {
    3780           0 :                         flag |= FLAG_ECE;
    3781           0 :                         ack_ev_flags |= CA_ACK_ECE;
    3782             :                 }
    3783             : 
    3784          22 :                 if (sack_state.sack_delivered)
    3785          22 :                         tcp_count_delivered(tp, sack_state.sack_delivered,
    3786           0 :                                             flag & FLAG_ECE);
    3787             : 
    3788          22 :                 if (flag & FLAG_WIN_UPDATE)
    3789          17 :                         ack_ev_flags |= CA_ACK_WIN_UPDATE;
    3790             : 
    3791          22 :                 tcp_in_ack_event(sk, ack_ev_flags);
    3792             :         }
    3793             : 
    3794             :         /* This is a deviation from RFC3168 since it states that:
    3795             :          * "When the TCP data sender is ready to set the CWR bit after reducing
    3796             :          * the congestion window, it SHOULD set the CWR bit only on the first
    3797             :          * new data packet that it transmits."
    3798             :          * We accept CWR on pure ACKs to be more robust
    3799             :          * with widely-deployed TCP implementations that do this.
    3800             :          */
    3801         370 :         tcp_ecn_accept_cwr(sk, skb);
    3802             : 
    3803             :         /* We passed data and got it acked, remove any soft error
    3804             :          * log. Something worked...
    3805             :          */
    3806         370 :         sk->sk_err_soft = 0;
    3807         370 :         icsk->icsk_probes_out = 0;
    3808         370 :         tp->rcv_tstamp = tcp_jiffies32;
    3809         370 :         if (!prior_packets)
    3810          19 :                 goto no_queue;
    3811             : 
    3812             :         /* See if we can take anything off of the retransmit queue. */
    3813         702 :         flag |= tcp_clean_rtx_queue(sk, skb, prior_fack, prior_snd_una,
    3814         351 :                                     &sack_state, flag & FLAG_ECE);
    3815             : 
    3816         351 :         tcp_rack_update_reo_wnd(sk, &rs);
    3817             : 
    3818         351 :         if (tp->tlp_high_seq)
    3819           0 :                 tcp_process_tlp_ack(sk, ack, flag);
    3820             : 
    3821         702 :         if (tcp_ack_is_dubious(sk, flag)) {
    3822           0 :                 if (!(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP))) {
    3823           0 :                         num_dupack = 1;
    3824             :                         /* Consider if pure acks were aggregated in tcp_add_backlog() */
    3825           0 :                         if (!(flag & FLAG_DATA))
    3826           0 :                                 num_dupack = max_t(u16, 1, skb_shinfo(skb)->gso_segs);
    3827             :                 }
    3828           0 :                 tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag,
    3829             :                                       &rexmit);
    3830             :         }
    3831             : 
    3832             :         /* If needed, reset the TLP/RTO timer when RACK doesn't set it. */
    3833         351 :         if (flag & FLAG_SET_XMIT_TIMER)
    3834         351 :                 tcp_set_xmit_timer(sk);
    3835             : 
    3836         351 :         if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP))
    3837         351 :                 sk_dst_confirm(sk);
    3838             : 
    3839         351 :         delivered = tcp_newly_delivered(sk, delivered, flag);
    3840         351 :         lost = tp->lost - lost;                      /* freshly marked lost */
    3841         351 :         rs.is_ack_delayed = !!(flag & FLAG_ACK_MAYBE_DELAYED);
    3842         351 :         tcp_rate_gen(sk, delivered, lost, is_sack_reneg, sack_state.rate);
    3843         351 :         tcp_cong_control(sk, ack, delivered, flag, sack_state.rate);
    3844         351 :         tcp_xmit_recovery(sk, rexmit);
    3845         351 :         return 1;
    3846             : 
    3847          19 : no_queue:
    3848             :         /* If data was DSACKed, see if we can undo a cwnd reduction. */
    3849          19 :         if (flag & FLAG_DSACKING_ACK) {
    3850           0 :                 tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag,
    3851             :                                       &rexmit);
    3852           0 :                 tcp_newly_delivered(sk, delivered, flag);
    3853             :         }
    3854             :         /* If this ack opens up a zero window, clear backoff.  It was
    3855             :          * being used to time the probes, and is probably far higher than
    3856             :          * it needs to be for normal retransmission.
    3857             :          */
    3858          19 :         tcp_ack_probe(sk);
    3859             : 
    3860          19 :         if (tp->tlp_high_seq)
    3861           0 :                 tcp_process_tlp_ack(sk, ack, flag);
    3862             :         return 1;
    3863             : 
    3864           0 : old_ack:
    3865             :         /* If data was SACKed, tag it and see if we should send more data.
    3866             :          * If data was DSACKed, see if we can undo a cwnd reduction.
    3867             :          */
    3868           0 :         if (TCP_SKB_CB(skb)->sacked) {
    3869           0 :                 flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
    3870             :                                                 &sack_state);
    3871           0 :                 tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag,
    3872             :                                       &rexmit);
    3873           0 :                 tcp_newly_delivered(sk, delivered, flag);
    3874           0 :                 tcp_xmit_recovery(sk, rexmit);
    3875             :         }
    3876             : 
    3877             :         return 0;
    3878             : }
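
The first few checks in tcp_ack() implement an acceptability window for incoming ACK numbers. A stand-alone sketch of that classification (the enum names are illustrative, not kernel identifiers):

#include <stdint.h>

enum ack_class {
        ACK_TOO_OLD,     /* below snd_una - max_window: answer with a challenge ACK */
        ACK_OLD,         /* below snd_una but within max_window: the "old_ack" path */
        ACK_ACCEPTABLE,  /* between snd_una and snd_nxt inclusive                   */
        ACK_TOO_NEW,     /* beyond snd_nxt: acks data never sent, discard           */
};

static int seq_before(uint32_t a, uint32_t b) { return (int32_t)(a - b) < 0; }
static int seq_after(uint32_t a, uint32_t b)  { return seq_before(b, a); }

static enum ack_class classify_ack(uint32_t ack, uint32_t snd_una,
                                   uint32_t snd_nxt, uint32_t max_window)
{
        if (seq_before(ack, snd_una)) {
                if (seq_before(ack, snd_una - max_window))
                        return ACK_TOO_OLD;
                return ACK_OLD;
        }
        if (seq_after(ack, snd_nxt))
                return ACK_TOO_NEW;
        return ACK_ACCEPTABLE;
}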
    3879             : 
    3880           0 : static void tcp_parse_fastopen_option(int len, const unsigned char *cookie,
    3881             :                                       bool syn, struct tcp_fastopen_cookie *foc,
    3882             :                                       bool exp_opt)
    3883             : {
    3884             :         /* Valid only in SYN or SYN-ACK with an even length.  */
    3885           0 :         if (!foc || !syn || len < 0 || (len & 1))
    3886             :                 return;
    3887             : 
    3888           0 :         if (len >= TCP_FASTOPEN_COOKIE_MIN &&
    3889             :             len <= TCP_FASTOPEN_COOKIE_MAX)
    3890           0 :                 memcpy(foc->val, cookie, len);
    3891           0 :         else if (len != 0)
    3892           0 :                 len = -1;
    3893           0 :         foc->len = len;
    3894           0 :         foc->exp = exp_opt;
    3895             : }
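
A stand-alone sketch of the cookie-length rules enforced above, assuming the conventional bounds of 4 and 16 bytes for TCP_FASTOPEN_COOKIE_MIN and TCP_FASTOPEN_COOKIE_MAX:

#include <string.h>

#define COOKIE_MIN 4            /* assumed TCP_FASTOPEN_COOKIE_MIN */
#define COOKIE_MAX 16           /* assumed TCP_FASTOPEN_COOKIE_MAX */

struct fo_cookie {
        signed char len;        /* -1: malformed option seen, 0: none, >0: length */
        unsigned char val[COOKIE_MAX];
};

/* Only SYN segments with a non-negative, even option length count; lengths
 * inside [COOKIE_MIN, COOKIE_MAX] are copied, and any other non-zero length
 * marks the cookie as malformed.
 */
static void parse_fo_cookie(int len, const unsigned char *data, int syn,
                            struct fo_cookie *foc)
{
        if (!foc || !syn || len < 0 || (len & 1))
                return;
        if (len >= COOKIE_MIN && len <= COOKIE_MAX)
                memcpy(foc->val, data, len);
        else if (len != 0)
                len = -1;
        foc->len = (signed char)len;
}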
    3896             : 
    3897           0 : static bool smc_parse_options(const struct tcphdr *th,
    3898             :                               struct tcp_options_received *opt_rx,
    3899             :                               const unsigned char *ptr,
    3900             :                               int opsize)
    3901             : {
    3902             : #if IS_ENABLED(CONFIG_SMC)
    3903             :         if (static_branch_unlikely(&tcp_have_smc)) {
    3904             :                 if (th->syn && !(opsize & 1) &&
    3905             :                     opsize >= TCPOLEN_EXP_SMC_BASE &&
    3906             :                     get_unaligned_be32(ptr) == TCPOPT_SMC_MAGIC) {
    3907             :                         opt_rx->smc_ok = 1;
    3908             :                         return true;
    3909             :                 }
    3910             :         }
    3911             : #endif
    3912           0 :         return false;
    3913             : }
    3914             : 
    3915             : /* Try to parse the MSS option from the TCP header. Return 0 on failure, or the
    3916             :  * clamped MSS value on success.
    3917             :  */
    3918           0 : static u16 tcp_parse_mss_option(const struct tcphdr *th, u16 user_mss)
    3919             : {
    3920           0 :         const unsigned char *ptr = (const unsigned char *)(th + 1);
    3921           0 :         int length = (th->doff * 4) - sizeof(struct tcphdr);
    3922           0 :         u16 mss = 0;
    3923             : 
    3924           0 :         while (length > 0) {
    3925           0 :                 int opcode = *ptr++;
    3926           0 :                 int opsize;
    3927             : 
    3928           0 :                 switch (opcode) {
    3929             :                 case TCPOPT_EOL:
    3930             :                         return mss;
    3931           0 :                 case TCPOPT_NOP:        /* Ref: RFC 793 section 3.1 */
    3932           0 :                         length--;
    3933           0 :                         continue;
    3934           0 :                 default:
    3935           0 :                         if (length < 2)
    3936           0 :                                 return mss;
    3937           0 :                         opsize = *ptr++;
    3938           0 :                         if (opsize < 2) /* "silly options" */
    3939           0 :                                 return mss;
    3940           0 :                         if (opsize > length)
    3941           0 :                                 return mss;     /* fail on partial options */
    3942           0 :                         if (opcode == TCPOPT_MSS && opsize == TCPOLEN_MSS) {
    3943           0 :                                 u16 in_mss = get_unaligned_be16(ptr);
    3944             : 
    3945           0 :                                 if (in_mss) {
    3946           0 :                                         if (user_mss && user_mss < in_mss)
    3947           0 :                                                 in_mss = user_mss;
    3948             :                                         mss = in_mss;
    3949             :                                 }
    3950             :                         }
    3951           0 :                         ptr += opsize - 2;
    3952           0 :                         length -= opsize;
    3953             :                 }
    3954             :         }
    3955             :         return mss;
    3956             : }
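
A stand-alone version of the same option walk over a raw byte buffer (MSS is option kind 2 with length 4); for example, the bytes 0x02 0x04 0x05 0xb4 advertise an MSS of 1460:

#include <stdint.h>

/* Walk the TCP option bytes (everything after the 20-byte base header)
 * and return the advertised MSS, or 0 if no well-formed MSS option is
 * found.  Same termination rules as above: stop on EOL, skip NOPs,
 * and bail out on truncated or "silly" options.
 */
static uint16_t parse_mss(const unsigned char *opt, int length)
{
        while (length > 0) {
                int kind = *opt++;
                int opsize;

                if (kind == 0)                  /* TCPOPT_EOL */
                        return 0;
                if (kind == 1) {                /* TCPOPT_NOP */
                        length--;
                        continue;
                }
                if (length < 2)
                        return 0;
                opsize = *opt++;
                if (opsize < 2 || opsize > length)
                        return 0;               /* silly or partial option */
                if (kind == 2 && opsize == 4)   /* TCPOPT_MSS / TCPOLEN_MSS */
                        return (uint16_t)((opt[0] << 8) | opt[1]);
                opt += opsize - 2;
                length -= opsize;
        }
        return 0;
}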
    3957             : 
    3958             : /* Look for TCP options. Normally only called on SYN and SYNACK packets.
    3959             :  * But this can also be called on packets in the established flow when
    3960             :  * the fast version below fails.
    3961             :  */
    3962           4 : void tcp_parse_options(const struct net *net,
    3963             :                        const struct sk_buff *skb,
    3964             :                        struct tcp_options_received *opt_rx, int estab,
    3965             :                        struct tcp_fastopen_cookie *foc)
    3966             : {
    3967           4 :         const unsigned char *ptr;
    3968           4 :         const struct tcphdr *th = tcp_hdr(skb);
    3969           4 :         int length = (th->doff * 4) - sizeof(struct tcphdr);
    3970             : 
    3971           4 :         ptr = (const unsigned char *)(th + 1);
    3972           4 :         opt_rx->saw_tstamp = 0;
    3973           4 :         opt_rx->saw_unknown = 0;
    3974             : 
    3975           8 :         while (length > 0) {
    3976           4 :                 int opcode = *ptr++;
    3977           4 :                 int opsize;
    3978             : 
    3979           4 :                 switch (opcode) {
    3980             :                 case TCPOPT_EOL:
    3981             :                         return;
    3982           0 :                 case TCPOPT_NOP:        /* Ref: RFC 793 section 3.1 */
    3983           0 :                         length--;
    3984           0 :                         continue;
    3985           4 :                 default:
    3986           4 :                         if (length < 2)
    3987             :                                 return;
    3988           4 :                         opsize = *ptr++;
    3989           4 :                         if (opsize < 2) /* "silly options" */
    3990             :                                 return;
    3991           4 :                         if (opsize > length)
    3992             :                                 return; /* don't parse partial options */
    3993           4 :                         switch (opcode) {
    3994           4 :                         case TCPOPT_MSS:
    3995           4 :                                 if (opsize == TCPOLEN_MSS && th->syn && !estab) {
    3996           4 :                                         u16 in_mss = get_unaligned_be16(ptr);
    3997           4 :                                         if (in_mss) {
    3998           4 :                                                 if (opt_rx->user_mss &&
    3999             :                                                     opt_rx->user_mss < in_mss)
    4000             :                                                         in_mss = opt_rx->user_mss;
    4001           4 :                                                 opt_rx->mss_clamp = in_mss;
    4002             :                                         }
    4003             :                                 }
    4004             :                                 break;
    4005           0 :                         case TCPOPT_WINDOW:
    4006           0 :                                 if (opsize == TCPOLEN_WINDOW && th->syn &&
    4007           0 :                                     !estab && net->ipv4.sysctl_tcp_window_scaling) {
    4008           0 :                                         __u8 snd_wscale = *(__u8 *)ptr;
    4009           0 :                                         opt_rx->wscale_ok = 1;
    4010           0 :                                         if (snd_wscale > TCP_MAX_WSCALE) {
    4011           0 :                                                 net_info_ratelimited("%s: Illegal window scaling value %d > %u received\n",
    4012             :                                                                      __func__,
    4013             :                                                                      snd_wscale,
    4014             :                                                                      TCP_MAX_WSCALE);
    4015             :                                                 snd_wscale = TCP_MAX_WSCALE;
    4016             :                                         }
    4017           0 :                                         opt_rx->snd_wscale = snd_wscale;
    4018             :                                 }
    4019             :                                 break;
    4020           0 :                         case TCPOPT_TIMESTAMP:
    4021           0 :                                 if ((opsize == TCPOLEN_TIMESTAMP) &&
    4022           0 :                                     ((estab && opt_rx->tstamp_ok) ||
    4023           0 :                                      (!estab && net->ipv4.sysctl_tcp_timestamps))) {
    4024           0 :                                         opt_rx->saw_tstamp = 1;
    4025           0 :                                         opt_rx->rcv_tsval = get_unaligned_be32(ptr);
    4026           0 :                                         opt_rx->rcv_tsecr = get_unaligned_be32(ptr + 4);
    4027             :                                 }
    4028             :                                 break;
    4029           0 :                         case TCPOPT_SACK_PERM:
    4030           0 :                                 if (opsize == TCPOLEN_SACK_PERM && th->syn &&
    4031           0 :                                     !estab && net->ipv4.sysctl_tcp_sack) {
    4032           0 :                                         opt_rx->sack_ok = TCP_SACK_SEEN;
    4033           0 :                                         tcp_sack_reset(opt_rx);
    4034             :                                 }
    4035             :                                 break;
    4036             : 
    4037           0 :                         case TCPOPT_SACK:
    4038           0 :                                 if ((opsize >= (TCPOLEN_SACK_BASE + TCPOLEN_SACK_PERBLOCK)) &&
    4039           0 :                                    !((opsize - TCPOLEN_SACK_BASE) % TCPOLEN_SACK_PERBLOCK) &&
    4040             :                                    opt_rx->sack_ok) {
    4041           0 :                                         TCP_SKB_CB(skb)->sacked = (ptr - 2) - (unsigned char *)th;
    4042             :                                 }
    4043             :                                 break;
    4044             : #ifdef CONFIG_TCP_MD5SIG
    4045             :                         case TCPOPT_MD5SIG:
    4046             :                                 /*
    4047             :                                  * The MD5 Hash has already been
    4048             :                                  * checked (see tcp_v{4,6}_do_rcv()).
    4049             :                                  */
    4050             :                                 break;
    4051             : #endif
    4052           0 :                         case TCPOPT_FASTOPEN:
    4053           0 :                                 tcp_parse_fastopen_option(
    4054             :                                         opsize - TCPOLEN_FASTOPEN_BASE,
    4055           0 :                                         ptr, th->syn, foc, false);
    4056           0 :                                 break;
    4057             : 
    4058           0 :                         case TCPOPT_EXP:
    4059             :                                 /* The Fast Open option shares option kind 254,
    4060             :                                  * distinguished by a 16-bit magic number.
    4061             :                                  */
    4062           0 :                                 if (opsize >= TCPOLEN_EXP_FASTOPEN_BASE &&
    4063           0 :                                     get_unaligned_be16(ptr) ==
    4064             :                                     TCPOPT_FASTOPEN_MAGIC) {
    4065           0 :                                         tcp_parse_fastopen_option(opsize -
    4066             :                                                 TCPOLEN_EXP_FASTOPEN_BASE,
    4067           0 :                                                 ptr + 2, th->syn, foc, true);
    4068           0 :                                         break;
    4069             :                                 }
    4070             : 
    4071           0 :                                 if (smc_parse_options(th, opt_rx, ptr, opsize))
    4072             :                                         break;
    4073             : 
    4074           0 :                                 opt_rx->saw_unknown = 1;
    4075           0 :                                 break;
    4076             : 
    4077           0 :                         default:
    4078           0 :                                 opt_rx->saw_unknown = 1;
    4079             :                         }
    4080           4 :                         ptr += opsize-2;
    4081           4 :                         length -= opsize;
    4082             :                 }
    4083             :         }
    4084             : }
    4085             : EXPORT_SYMBOL(tcp_parse_options);
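
For concreteness, a typical 20-byte SYN option block of the kind this parser walks, laid out the way many Linux senders emit it (the timestamp values are zeroed here purely for illustration):

/*   02 04 05 b4                      MSS = 1460
 *   04 02                            SACK permitted
 *   08 0a <TSval x4> <TSecr x4>      Timestamps
 *   01                               NOP padding
 *   03 03 07                         Window scale, shift count 7
 */
static const unsigned char example_syn_opts[20] = {
        0x02, 0x04, 0x05, 0xb4,                         /* MSS 1460        */
        0x04, 0x02,                                     /* SACK permitted  */
        0x08, 0x0a, 0, 0, 0, 0, 0, 0, 0, 0,             /* timestamps      */
        0x01,                                           /* NOP             */
        0x03, 0x03, 0x07,                               /* window scale 7  */
};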
    4086             : 
    4087           0 : static bool tcp_parse_aligned_timestamp(struct tcp_sock *tp, const struct tcphdr *th)
    4088             : {
    4089           0 :         const __be32 *ptr = (const __be32 *)(th + 1);
    4090             : 
    4091           0 :         if (*ptr == htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
    4092             :                           | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) {
    4093           0 :                 tp->rx_opt.saw_tstamp = 1;
    4094           0 :                 ++ptr;
    4095           0 :                 tp->rx_opt.rcv_tsval = ntohl(*ptr);
    4096           0 :                 ++ptr;
    4097           0 :                 if (*ptr)
    4098           0 :                         tp->rx_opt.rcv_tsecr = ntohl(*ptr) - tp->tsoffset;
    4099             :                 else
    4100           0 :                         tp->rx_opt.rcv_tsecr = 0;
    4101           0 :                 return true;
    4102             :         }
    4103             :         return false;
    4104             : }
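
The fast path above matches exactly one on-the-wire layout: NOP, NOP, kind 8, length 10, TSval, TSecr, i.e. the 12-byte aligned timestamp option. A stand-alone sketch of the same single-word check over a raw buffer (POSIX user space, hence arpa/inet.h):

#include <stdint.h>
#include <string.h>
#include <arpa/inet.h>

struct ts_opt {
        uint32_t tsval;
        uint32_t tsecr;
};

/* opt points at the 12 option bytes that follow the base header when the
 * aligned timestamp option is the only option present.
 */
static int parse_aligned_ts(const unsigned char *opt, struct ts_opt *out)
{
        uint32_t word;

        memcpy(&word, opt, 4);
        /* NOP(0x01) NOP(0x01) TIMESTAMP(0x08) TCPOLEN_TIMESTAMP(0x0a) */
        if (word != htonl(0x0101080a))
                return 0;
        memcpy(&word, opt + 4, 4);
        out->tsval = ntohl(word);
        memcpy(&word, opt + 8, 4);
        out->tsecr = ntohl(word);
        return 1;
}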
    4105             : 
    4106             : /* Fast parse options. This hopes to only see timestamps.
    4107             :  * If it is wrong it falls back on tcp_parse_options().
    4108             :  */
    4109          10 : static bool tcp_fast_parse_options(const struct net *net,
    4110             :                                    const struct sk_buff *skb,
    4111             :                                    const struct tcphdr *th, struct tcp_sock *tp)
    4112             : {
    4113             :         /* In the spirit of fast parsing, compare doff directly to constant
    4114             :          * values.  Because equality is used, short doff can be ignored here.
    4115             :          */
    4116          10 :         if (th->doff == (sizeof(*th) / 4)) {
    4117          10 :                 tp->rx_opt.saw_tstamp = 0;
    4118          10 :                 return false;
    4119           0 :         } else if (tp->rx_opt.tstamp_ok &&
    4120             :                    th->doff == ((sizeof(*th) + TCPOLEN_TSTAMP_ALIGNED) / 4)) {
    4121           0 :                 if (tcp_parse_aligned_timestamp(tp, th))
    4122             :                         return true;
    4123             :         }
    4124             : 
    4125           0 :         tcp_parse_options(net, skb, &tp->rx_opt, 1, NULL);
    4126           0 :         if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
    4127           0 :                 tp->rx_opt.rcv_tsecr -= tp->tsoffset;
    4128             : 
    4129             :         return true;
    4130             : }
    4131             : 
    4132             : #ifdef CONFIG_TCP_MD5SIG
    4133             : /*
    4134             :  * Parse MD5 Signature option
    4135             :  */
    4136             : const u8 *tcp_parse_md5sig_option(const struct tcphdr *th)
    4137             : {
    4138             :         int length = (th->doff << 2) - sizeof(*th);
    4139             :         const u8 *ptr = (const u8 *)(th + 1);
    4140             : 
    4141             :         /* If there is not enough data remaining, we can short-circuit */
    4142             :         while (length >= TCPOLEN_MD5SIG) {
    4143             :                 int opcode = *ptr++;
    4144             :                 int opsize;
    4145             : 
    4146             :                 switch (opcode) {
    4147             :                 case TCPOPT_EOL:
    4148             :                         return NULL;
    4149             :                 case TCPOPT_NOP:
    4150             :                         length--;
    4151             :                         continue;
    4152             :                 default:
    4153             :                         opsize = *ptr++;
    4154             :                         if (opsize < 2 || opsize > length)
    4155             :                                 return NULL;
    4156             :                         if (opcode == TCPOPT_MD5SIG)
    4157             :                                 return opsize == TCPOLEN_MD5SIG ? ptr : NULL;
    4158             :                 }
    4159             :                 ptr += opsize - 2;
    4160             :                 length -= opsize;
    4161             :         }
    4162             :         return NULL;
    4163             : }
    4164             : EXPORT_SYMBOL(tcp_parse_md5sig_option);
    4165             : #endif
    4166             : 
    4167             : /* Sorry, PAWS as specified is broken wrt. pure-ACKs -DaveM
    4168             :  *
    4169             :  * It is not fatal. If this ACK does _not_ change critical state (seqs, window),
    4170             :  * it can pass through the stack. So, the following predicate verifies that
    4171             :  * this segment is not used for anything but congestion avoidance or
    4172             :  * fast retransmit. Moreover, we are even able to eliminate most such
    4173             :  * second-order effects if we apply a small "replay" window (~RTO)
    4174             :  * to timestamp space.
    4175             :  *
    4176             :  * All these measures still do not guarantee that we reject wrapped ACKs
    4177             :  * on high-bandwidth networks where sequence space is recycled quickly,
    4178             :  * but they do guarantee that such events will be very rare and will not
    4179             :  * affect the connection seriously. This doesn't look nice, but alas, PAWS
    4180             :  * is a really buggy extension.
    4181             :  *
    4182             :  * [ Later note. Even worse! It is buggy for segments _with_ data. The RFC
    4183             :  * states that cases where a retransmit arrives after the original data are
    4184             :  * rare. It is a blatant lie. VJ forgot about fast retransmit! 8)8) It is
    4185             :  * the biggest problem on large power networks even with minor reordering.
    4186             :  * OK, let's give it a small replay window. If the peer clock ticks at even
    4187             :  * 1 Hz, it is safe up to a bandwidth of 18 Gbit/sec. 8) ]
    4188             :  */
    4189             : 
    4190           0 : static int tcp_disordered_ack(const struct sock *sk, const struct sk_buff *skb)
    4191             : {
    4192           0 :         const struct tcp_sock *tp = tcp_sk(sk);
    4193           0 :         const struct tcphdr *th = tcp_hdr(skb);
    4194           0 :         u32 seq = TCP_SKB_CB(skb)->seq;
    4195           0 :         u32 ack = TCP_SKB_CB(skb)->ack_seq;
    4196             : 
    4197           0 :         return (/* 1. Pure ACK with correct sequence number. */
    4198           0 :                 (th->ack && seq == TCP_SKB_CB(skb)->end_seq && seq == tp->rcv_nxt) &&
    4199             : 
    4200             :                 /* 2. ... and duplicate ACK. */
    4201           0 :                 ack == tp->snd_una &&
    4202             : 
    4203             :                 /* 3. ... and does not update window. */
    4204           0 :                 !tcp_may_update_window(tp, ack, seq, ntohs(th->window) << tp->rx_opt.snd_wscale) &&
    4205             : 
    4206             :                 /* 4. ... and sits in replay window. */
    4207           0 :                 (s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) <= (inet_csk(sk)->icsk_rto * 1024) / HZ);
    4208             : }
    4209             : 
    4210           0 : static inline bool tcp_paws_discard(const struct sock *sk,
    4211             :                                    const struct sk_buff *skb)
    4212             : {
    4213           0 :         const struct tcp_sock *tp = tcp_sk(sk);
    4214             : 
    4215           0 :         return !tcp_paws_check(&tp->rx_opt, TCP_PAWS_WINDOW) &&
    4216           0 :                !tcp_disordered_ack(sk, skb);
    4217             : }
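
A stand-alone sketch of the core PAWS comparison used by tcp_paws_discard(): a segment's TSval may lag the last validated ts_recent by at most a small window, with the subtraction done in signed 32-bit arithmetic so timestamp wraparound behaves. (The 24-day staleness escape hatch of the real tcp_paws_check() is omitted.)

#include <stdint.h>

/* paws_win is TCP_PAWS_WINDOW (1 in this kernel) for data segments, or 0
 * for the stricter check used when updating ts_recent.
 */
static int paws_ok(uint32_t ts_recent, uint32_t rcv_tsval, int32_t paws_win)
{
        return (int32_t)(ts_recent - rcv_tsval) <= paws_win;
}

/* paws_ok(1000,  999, 1) -> 1  (TSval one tick behind: still acceptable) */
/* paws_ok(1000,  900, 1) -> 0  (TSval 100 ticks behind: PAWS reject)     */
/* paws_ok(1000, 5000, 1) -> 1  (TSval ahead of ts_recent: fine)          */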
    4218             : 
    4219             : /* Check segment sequence number for validity.
    4220             :  *
    4221             :  * Segment controls are considered valid if the segment
    4222             :  * fits into the window after truncation to the window. Acceptability
    4223             :  * of data (and SYN, FIN, of course) is checked separately.
    4224             :  * See tcp_data_queue(), for example.
    4225             :  *
    4226             :  * Also, controls (RST is the main one) are accepted using RCV.WUP instead
    4227             :  * of RCV.NXT. The peer may not have advanced its SND.UNA yet when we
    4228             :  * delayed our ACK, so that his SND.UNA <= our RCV.WUP.
    4229             :  * (borrowed from freebsd)
    4230             :  */
    4231             : 
    4232          10 : static inline bool tcp_sequence(const struct tcp_sock *tp, u32 seq, u32 end_seq)
    4233             : {
    4234          10 :         return  !before(end_seq, tp->rcv_wup) &&
    4235          10 :                 !after(seq, tp->rcv_nxt + tcp_receive_window(tp));
    4236             : }
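
A worked example of the acceptance predicate with made-up numbers: with RCV.WUP = RCV.NXT = 1000 and a 500-byte receive window, a segment is acceptable iff it ends at or after 1000 and starts no later than 1500:

#include <stdint.h>

static int seq_before(uint32_t a, uint32_t b) { return (int32_t)(a - b) < 0; }
static int seq_after(uint32_t a, uint32_t b)  { return seq_before(b, a); }

/* Mirrors the predicate above: the segment must not end before RCV.WUP
 * and must not start beyond RCV.NXT + window.
 */
static int seq_in_window(uint32_t seq, uint32_t end_seq,
                         uint32_t rcv_wup, uint32_t rcv_nxt, uint32_t window)
{
        return !seq_before(end_seq, rcv_wup) &&
               !seq_after(seq, rcv_nxt + window);
}

/* seq_in_window(1200, 1300, 1000, 1000, 500) -> 1 (fully inside)      */
/* seq_in_window( 800,  900, 1000, 1000, 500) -> 0 (entirely old)      */
/* seq_in_window(1600, 1700, 1000, 1000, 500) -> 0 (beyond the window) */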
    4237             : 
    4238             : /* When we get a reset we do this. */
    4239           0 : void tcp_reset(struct sock *sk, struct sk_buff *skb)
    4240             : {
    4241           0 :         trace_tcp_receive_reset(sk);
    4242             : 
    4243           0 :         if (sk_is_mptcp(sk))
    4244           0 :                 mptcp_incoming_options(sk, skb);
    4245             : 
    4246             :         /* We want the right error as BSD sees it (and indeed as we do). */
    4247           0 :         switch (sk->sk_state) {
    4248           0 :         case TCP_SYN_SENT:
    4249           0 :                 sk->sk_err = ECONNREFUSED;
    4250           0 :                 break;
    4251           0 :         case TCP_CLOSE_WAIT:
    4252           0 :                 sk->sk_err = EPIPE;
    4253           0 :                 break;
    4254             :         case TCP_CLOSE:
    4255             :                 return;
    4256           0 :         default:
    4257           0 :                 sk->sk_err = ECONNRESET;
    4258             :         }
    4259             :         /* This barrier is coupled with smp_rmb() in tcp_poll() */
    4260           0 :         smp_wmb();
    4261             : 
    4262           0 :         tcp_write_queue_purge(sk);
    4263           0 :         tcp_done(sk);
    4264             : 
    4265           0 :         if (!sock_flag(sk, SOCK_DEAD))
    4266           0 :                 sk->sk_error_report(sk);
    4267             : }
    4268             : 
    4269             : /*
    4270             :  *      Process the FIN bit. This now behaves as it is supposed to:
    4271             :  *      the FIN takes effect only when it is validly part of the sequence
    4272             :  *      space, not earlier while there are still holes before it.
    4273             :  *
    4274             :  *      If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT
    4275             :  *      (and thence onto LAST-ACK and finally, CLOSE, we never enter
    4276             :  *      TIME-WAIT)
    4277             :  *
    4278             :  *      If we are in FINWAIT-1, a received FIN indicates simultaneous
    4279             :  *      close and we go into CLOSING (and later onto TIME-WAIT)
    4280             :  *
    4281             :  *      If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
    4282             :  */
    4283           3 : void tcp_fin(struct sock *sk)
    4284             : {
    4285           3 :         struct tcp_sock *tp = tcp_sk(sk);
    4286             : 
    4287           3 :         inet_csk_schedule_ack(sk);
    4288             : 
    4289           3 :         sk->sk_shutdown |= RCV_SHUTDOWN;
    4290           3 :         sock_set_flag(sk, SOCK_DONE);
    4291             : 
    4292           3 :         switch (sk->sk_state) {
    4293           3 :         case TCP_SYN_RECV:
    4294             :         case TCP_ESTABLISHED:
    4295             :                 /* Move to CLOSE_WAIT */
    4296           3 :                 tcp_set_state(sk, TCP_CLOSE_WAIT);
    4297           3 :                 inet_csk_enter_pingpong_mode(sk);
    4298             :                 break;
    4299             : 
    4300             :         case TCP_CLOSE_WAIT:
    4301             :         case TCP_CLOSING:
    4302             :                 /* Received a retransmission of the FIN, do
    4303             :                  * nothing.
    4304             :                  */
    4305             :                 break;
    4306             :         case TCP_LAST_ACK:
    4307             :                 /* RFC793: Remain in the LAST-ACK state. */
    4308             :                 break;
    4309             : 
    4310           0 :         case TCP_FIN_WAIT1:
    4311             :                 /* This case occurs when a simultaneous close
    4312             :                  * happens, we must ack the received FIN and
    4313             :                  * enter the CLOSING state.
    4314             :                  */
    4315           0 :                 tcp_send_ack(sk);
    4316           0 :                 tcp_set_state(sk, TCP_CLOSING);
    4317           0 :                 break;
    4318           0 :         case TCP_FIN_WAIT2:
    4319             :                 /* Received a FIN -- send ACK and enter TIME_WAIT. */
    4320           0 :                 tcp_send_ack(sk);
    4321           0 :                 tcp_time_wait(sk, TCP_TIME_WAIT, 0);
    4322           0 :                 break;
    4323           0 :         default:
    4324             :                 /* Only TCP_LISTEN and TCP_CLOSE are left; in these
    4325             :                  * cases we should never reach this piece of code.
    4326             :                  */
    4327           0 :                 pr_err("%s: Impossible, sk->sk_state=%d\n",
    4328             :                        __func__, sk->sk_state);
    4329           0 :                 break;
    4330             :         }
    4331             : 
    4332             :         /* It _is_ possible that we have something out-of-order _after_ the FIN.
    4333             :          * Probably we should reset in this case. For now, drop them.
    4334             :          */
    4335           3 :         skb_rbtree_purge(&tp->out_of_order_queue);
    4336           3 :         if (tcp_is_sack(tp))
    4337           0 :                 tcp_sack_reset(&tp->rx_opt);
    4338           3 :         sk_mem_reclaim(sk);
    4339             : 
    4340           3 :         if (!sock_flag(sk, SOCK_DEAD)) {
    4341           3 :                 sk->sk_state_change(sk);
    4342             : 
    4343             :                 /* Do not send POLL_HUP for half duplex close. */
    4344           3 :                 if (sk->sk_shutdown == SHUTDOWN_MASK ||
    4345           3 :                     sk->sk_state == TCP_CLOSE)
    4346           0 :                         sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP);
    4347             :                 else
    4348           3 :                         sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
    4349             :         }
    4350           3 : }
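
A stand-alone summary of the transitions performed above, reduced to a pure function over illustrative enum values (the ACKs and timers handled by the real code are noted only in comments):

enum fin_state {
        ST_ESTABLISHED, ST_SYN_RECV, ST_FIN_WAIT1, ST_FIN_WAIT2,
        ST_CLOSE_WAIT, ST_CLOSING, ST_LAST_ACK, ST_TIME_WAIT,
};

static enum fin_state state_after_fin(enum fin_state cur)
{
        switch (cur) {
        case ST_SYN_RECV:
        case ST_ESTABLISHED:
                return ST_CLOSE_WAIT;   /* passive close begins                  */
        case ST_FIN_WAIT1:
                return ST_CLOSING;      /* simultaneous close; ACK the FIN       */
        case ST_FIN_WAIT2:
                return ST_TIME_WAIT;    /* ACK the FIN and wait out 2*MSL        */
        case ST_CLOSE_WAIT:
        case ST_CLOSING:
        case ST_LAST_ACK:
        default:
                return cur;             /* retransmitted or unexpected FIN: no change */
        }
}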
    4351             : 
    4352           0 : static inline bool tcp_sack_extend(struct tcp_sack_block *sp, u32 seq,
    4353             :                                   u32 end_seq)
    4354             : {
    4355           0 :         if (!after(seq, sp->end_seq) && !after(sp->start_seq, end_seq)) {
    4356           0 :                 if (before(seq, sp->start_seq))
    4357           0 :                         sp->start_seq = seq;
    4358           0 :                 if (after(end_seq, sp->end_seq))
    4359           0 :                         sp->end_seq = end_seq;
    4360           0 :                 return true;
    4361             :         }
    4362             :         return false;
    4363             : }
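
A stand-alone sketch of the same extend-if-touching rule on a plain range type, with two worked cases in the trailing comments:

#include <stdint.h>

struct range { uint32_t start, end; };

static int seq_before(uint32_t a, uint32_t b) { return (int32_t)(a - b) < 0; }
static int seq_after(uint32_t a, uint32_t b)  { return seq_before(b, a); }

/* Grow *r to cover [seq, end_seq) if the two ranges touch or overlap,
 * mirroring tcp_sack_extend() above.  Returns 1 when merged.
 */
static int range_extend(struct range *r, uint32_t seq, uint32_t end_seq)
{
        if (!seq_after(seq, r->end) && !seq_after(r->start, end_seq)) {
                if (seq_before(seq, r->start))
                        r->start = seq;
                if (seq_after(end_seq, r->end))
                        r->end = end_seq;
                return 1;
        }
        return 0;
}

/* {100,200} + [200,300) -> merged into {100,300} (adjacent ranges touch) */
/* {100,200} + [350,400) -> not merged (a gap remains between them)       */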
    4364             : 
    4365           0 : static void tcp_dsack_set(struct sock *sk, u32 seq, u32 end_seq)
    4366             : {
    4367           0 :         struct tcp_sock *tp = tcp_sk(sk);
    4368             : 
    4369           0 :         if (tcp_is_sack(tp) && sock_net(sk)->ipv4.sysctl_tcp_dsack) {
    4370           0 :                 int mib_idx;
    4371             : 
    4372           0 :                 if (before(seq, tp->rcv_nxt))
    4373             :                         mib_idx = LINUX_MIB_TCPDSACKOLDSENT;
    4374             :                 else
    4375           0 :                         mib_idx = LINUX_MIB_TCPDSACKOFOSENT;
    4376             : 
    4377           0 :                 NET_INC_STATS(sock_net(sk), mib_idx);
    4378             : 
    4379           0 :                 tp->rx_opt.dsack = 1;
    4380           0 :                 tp->duplicate_sack[0].start_seq = seq;
    4381           0 :                 tp->duplicate_sack[0].end_seq = end_seq;
    4382             :         }
    4383           0 : }
    4384             : 
    4385           0 : static void tcp_dsack_extend(struct sock *sk, u32 seq, u32 end_seq)
    4386             : {
    4387           0 :         struct tcp_sock *tp = tcp_sk(sk);
    4388             : 
    4389           0 :         if (!tp->rx_opt.dsack)
    4390           0 :                 tcp_dsack_set(sk, seq, end_seq);
    4391             :         else
    4392           0 :                 tcp_sack_extend(tp->duplicate_sack, seq, end_seq);
    4393           0 : }
    4394             : 
    4395           0 : static void tcp_rcv_spurious_retrans(struct sock *sk, const struct sk_buff *skb)
    4396             : {
    4397             :         /* When the ACK path fails or drops most ACKs, the sender will
    4398             :          * time out and spuriously retransmit the same segment repeatedly.
    4399             :          * The receiver remembers and reflects via DSACKs. Leverage the
    4400             :          * DSACK state and change the txhash to re-route speculatively.
    4401             :          */
    4402           0 :         if (TCP_SKB_CB(skb)->seq == tcp_sk(sk)->duplicate_sack[0].start_seq &&
    4403           0 :             sk_rethink_txhash(sk))
    4404           0 :                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDUPLICATEDATAREHASH);
    4405           0 : }
    4406             : 
    4407           0 : static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb)
    4408             : {
    4409           0 :         struct tcp_sock *tp = tcp_sk(sk);
    4410             : 
    4411           0 :         if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
    4412           0 :             before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
    4413           0 :                 NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
    4414           0 :                 tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS);
    4415             : 
    4416           0 :                 if (tcp_is_sack(tp) && sock_net(sk)->ipv4.sysctl_tcp_dsack) {
    4417           0 :                         u32 end_seq = TCP_SKB_CB(skb)->end_seq;
    4418             : 
    4419           0 :                         tcp_rcv_spurious_retrans(sk, skb);
    4420           0 :                         if (after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))
    4421           0 :                                 end_seq = tp->rcv_nxt;
    4422           0 :                         tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, end_seq);
    4423             :                 }
    4424             :         }
    4425             : 
    4426           0 :         tcp_send_ack(sk);
    4427           0 : }
    4428             : 
    4429             : /* These routines update the SACK block as out-of-order packets arrive or
    4430             :  * in-order packets close up the sequence space.
    4431             :  */
    4432           0 : static void tcp_sack_maybe_coalesce(struct tcp_sock *tp)
    4433             : {
    4434           0 :         int this_sack;
    4435           0 :         struct tcp_sack_block *sp = &tp->selective_acks[0];
    4436           0 :         struct tcp_sack_block *swalk = sp + 1;
    4437             : 
    4438             :         /* See if the recent change to the first SACK eats into
    4439             :          * or hits the sequence space of other SACK blocks; if so, coalesce.
    4440             :          */
    4441           0 :         for (this_sack = 1; this_sack < tp->rx_opt.num_sacks;) {
    4442           0 :                 if (tcp_sack_extend(sp, swalk->start_seq, swalk->end_seq)) {
    4443           0 :                         int i;
    4444             : 
    4445             :                         /* Zap SWALK, by moving every further SACK up by one slot.
    4446             :                          * Decrease num_sacks.
    4447             :                          */
    4448           0 :                         tp->rx_opt.num_sacks--;
    4449           0 :                         for (i = this_sack; i < tp->rx_opt.num_sacks; i++)
    4450           0 :                                 sp[i] = sp[i + 1];
    4451           0 :                         continue;
    4452             :                 }
    4453           0 :                 this_sack++;
    4454           0 :                 swalk++;
    4455             :         }
    4456           0 : }
    4457             : 
    4458           0 : static void tcp_sack_compress_send_ack(struct sock *sk)
    4459             : {
    4460           0 :         struct tcp_sock *tp = tcp_sk(sk);
    4461             : 
    4462           0 :         if (!tp->compressed_ack)
    4463             :                 return;
    4464             : 
    4465           0 :         if (hrtimer_try_to_cancel(&tp->compressed_ack_timer) == 1)
    4466           0 :                 __sock_put(sk);
    4467             : 
    4468             :         /* Since we finally have to send one ACK,
    4469             :          * subtract one from tp->compressed_ack to keep
    4470             :          * LINUX_MIB_TCPACKCOMPRESSED accurate.
    4471             :          */
    4472           0 :         NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED,
    4473             :                       tp->compressed_ack - 1);
    4474             : 
    4475           0 :         tp->compressed_ack = 0;
    4476           0 :         tcp_send_ack(sk);
    4477             : }
    4478             : 
    4479             : /* Reasonable number of SACK blocks included in the TCP SACK option.
    4480             :  * The max is 4, but this becomes 3 if TCP timestamps are present.
    4481             :  * Given that SACK packets might be lost, be conservative and use 2.
    4482             :  */
    4483             : #define TCP_SACK_BLOCKS_EXPECTED 2
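
The 4-versus-3 figure follows from the 40 bytes of TCP option space: a SACK option costs 2 bytes of kind/length plus 8 bytes per block, and the timestamp option typically consumes another 12 bytes (10 plus 2 bytes of NOP padding). A small arithmetic sketch under those standard sizes:

/* Maximum number of SACK blocks that fit next to the other options.
 * TCP option space is 40 bytes; a SACK option is 2 bytes of kind/length
 * plus 8 bytes per block.
 */
static int max_sack_blocks(int other_option_bytes)
{
        return (40 - other_option_bytes - 2) / 8;
}

/* max_sack_blocks(0)  -> 4  (SACK alone)       */
/* max_sack_blocks(12) -> 3  (with timestamps)  */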
    4484             : 
    4485           0 : static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq)
    4486             : {
    4487           0 :         struct tcp_sock *tp = tcp_sk(sk);
    4488           0 :         struct tcp_sack_block *sp = &tp->selective_acks[0];
    4489           0 :         int cur_sacks = tp->rx_opt.num_sacks;
    4490           0 :         int this_sack;
    4491             : 
    4492           0 :         if (!cur_sacks)
    4493           0 :                 goto new_sack;
    4494             : 
    4495           0 :         for (this_sack = 0; this_sack < cur_sacks; this_sack++, sp++) {
    4496           0 :                 if (tcp_sack_extend(sp, seq, end_seq)) {
    4497           0 :                         if (this_sack >= TCP_SACK_BLOCKS_EXPECTED)
    4498           0 :                                 tcp_sack_compress_send_ack(sk);
    4499             :                         /* Rotate this_sack to the first one. */
    4500           0 :                         for (; this_sack > 0; this_sack--, sp--)
    4501           0 :                                 swap(*sp, *(sp - 1));
    4502           0 :                         if (cur_sacks > 1)
    4503           0 :                                 tcp_sack_maybe_coalesce(tp);
    4504           0 :                         return;
    4505             :                 }
    4506             :         }
    4507             : 
    4508           0 :         if (this_sack >= TCP_SACK_BLOCKS_EXPECTED)
    4509           0 :                 tcp_sack_compress_send_ack(sk);
    4510             : 
    4511             :         /* Could not find an adjacent existing SACK; build a new one,
    4512             :          * put it at the front, and shift everyone else down.  We
    4513             :          * always know there is at least one SACK present already here.
    4514             :          *
    4515             :          * If the sack array is full, forget about the last one.
    4516             :          */
    4517           0 :         if (this_sack >= TCP_NUM_SACKS) {
    4518           0 :                 this_sack--;
    4519           0 :                 tp->rx_opt.num_sacks--;
    4520           0 :                 sp--;
    4521             :         }
    4522           0 :         for (; this_sack > 0; this_sack--, sp--)
    4523           0 :                 *sp = *(sp - 1);
    4524             : 
    4525           0 : new_sack:
    4526             :         /* Build the new head SACK, and we're done. */
    4527           0 :         sp->start_seq = seq;
    4528           0 :         sp->end_seq = end_seq;
    4529           0 :         tp->rx_opt.num_sacks++;
    4530             : }
    4531             : 
    4532             : /* RCV.NXT advances, some SACKs should be eaten. */
    4533             : /* RCV.NXT advances; some SACKs should be eaten. */
    4534           0 : static void tcp_sack_remove(struct tcp_sock *tp)
    4535             : {
    4536           0 :         struct tcp_sack_block *sp = &tp->selective_acks[0];
    4537           0 :         int num_sacks = tp->rx_opt.num_sacks;
    4538           0 :         int this_sack;
    4539             : 
    4540             :         /* Empty ofo queue, hence, all the SACKs are eaten. Clear. */
    4541           0 :         if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
    4542           0 :                 tp->rx_opt.num_sacks = 0;
    4543           0 :                 return;
    4544             :         }
    4545             : 
    4546           0 :         for (this_sack = 0; this_sack < num_sacks;) {
    4547             :                 /* Check if the start of the sack is covered by RCV.NXT. */
    4548           0 :                 if (!before(tp->rcv_nxt, sp->start_seq)) {
    4549           0 :                         int i;
    4550             : 
    4551             :                         /* RCV.NXT must cover all the block! */
    4552           0 :                         WARN_ON(before(tp->rcv_nxt, sp->end_seq));
    4553             : 
    4554             :                         /* Zap this SACK, by moving forward any other SACKS. */
    4555           0 :                         for (i = this_sack+1; i < num_sacks; i++)
    4556           0 :                                 tp->selective_acks[i-1] = tp->selective_acks[i];
    4557           0 :                         num_sacks--;
    4558           0 :                         continue;
    4559             :                 }
    4560           0 :                 this_sack++;
    4561           0 :                 sp++;
    4562             :         }
    4563           0 :         tp->rx_opt.num_sacks = num_sacks;
    4564             : }
    4565             : 
    4566             : /**
    4567             :  * tcp_try_coalesce - try to merge skb to prior one
    4568             :  * @sk: socket
    4569             :  * @to: prior buffer
    4570             :  * @from: buffer to add in queue
    4571             :  * @fragstolen: pointer to boolean
    4572             :  *
    4573             :  * Before queueing skb @from after @to, try to merge them
    4574             :  * to reduce overall memory use and queue lengths, if the cost is small.
    4575             :  * Packets in the ofo or receive queues can stay there a long time.
    4576             :  * It is better to coalesce them right now to avoid future collapses.
    4577             :  * Returns true if the caller should free @from instead of queueing it.
    4578             :  */
    4579          16 : static bool tcp_try_coalesce(struct sock *sk,
    4580             :                              struct sk_buff *to,
    4581             :                              struct sk_buff *from,
    4582             :                              bool *fragstolen)
    4583             : {
    4584          16 :         int delta;
    4585             : 
    4586          16 :         *fragstolen = false;
    4587             : 
    4588             :         /* It's possible this segment overlaps with a prior segment in the queue */
    4589          16 :         if (TCP_SKB_CB(from)->seq != TCP_SKB_CB(to)->end_seq)
    4590             :                 return false;
    4591             : 
    4592          16 :         if (!mptcp_skb_can_collapse(to, from))
    4593             :                 return false;
    4594             : 
    4595             : #ifdef CONFIG_TLS_DEVICE
    4596             :         if (from->decrypted != to->decrypted)
    4597             :                 return false;
    4598             : #endif
    4599             : 
    4600          16 :         if (!skb_try_coalesce(to, from, fragstolen, &delta))
    4601             :                 return false;
    4602             : 
    4603           2 :         atomic_add(delta, &sk->sk_rmem_alloc);
    4604           2 :         sk_mem_charge(sk, delta);
    4605           2 :         NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVCOALESCE);
    4606           2 :         TCP_SKB_CB(to)->end_seq = TCP_SKB_CB(from)->end_seq;
    4607           2 :         TCP_SKB_CB(to)->ack_seq = TCP_SKB_CB(from)->ack_seq;
    4608           2 :         TCP_SKB_CB(to)->tcp_flags |= TCP_SKB_CB(from)->tcp_flags;
    4609             : 
    4610           2 :         if (TCP_SKB_CB(from)->has_rxtstamp) {
    4611           0 :                 TCP_SKB_CB(to)->has_rxtstamp = true;
    4612           0 :                 to->tstamp = from->tstamp;
    4613           0 :                 skb_hwtstamps(to)->hwtstamp = skb_hwtstamps(from)->hwtstamp;
    4614             :         }
    4615             : 
    4616             :         return true;
    4617             : }
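
Coalescing is only attempted when @from starts exactly where @to ends; on success the tail segment simply absorbs the new bytes by extending end_seq and OR-ing in the TCP flags. Below is a small self-contained sketch of that bookkeeping, using a hypothetical struct seg in place of the skb control block; the real function also charges the socket's memory accounting and copies timestamps.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical stand-in for the TCP_SKB_CB() fields used on this path. */
struct seg {
        uint32_t seq, end_seq;
        uint8_t  flags;
};

/* Merge @from into @to if it is exactly contiguous; mirrors the seq check
 * and the end_seq/flags bookkeeping done on a successful coalesce.
 */
static bool seg_try_coalesce(struct seg *to, const struct seg *from)
{
        if (from->seq != to->end_seq)
                return false;   /* gap or overlap: caller must queue @from */

        to->end_seq = from->end_seq;
        to->flags  |= from->flags;
        return true;
}

int main(void)
{
        struct seg tail = { .seq = 1000, .end_seq = 2000, .flags = 0x10 };
        struct seg next = { .seq = 2000, .end_seq = 2500, .flags = 0x18 };

        if (seg_try_coalesce(&tail, &next))
                printf("tail now covers [%u, %u), flags 0x%x\n",
                       (unsigned)tail.seq, (unsigned)tail.end_seq,
                       (unsigned)tail.flags);
        return 0;
}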
    4618             : 
    4619           0 : static bool tcp_ooo_try_coalesce(struct sock *sk,
    4620             :                              struct sk_buff *to,
    4621             :                              struct sk_buff *from,
    4622             :                              bool *fragstolen)
    4623             : {
    4624           0 :         bool res = tcp_try_coalesce(sk, to, from, fragstolen);
    4625             : 
    4626             :         /* In case tcp_drop() is called later, update to->gso_segs */
    4627           0 :         if (res) {
    4628           0 :                 u32 gso_segs = max_t(u16, 1, skb_shinfo(to)->gso_segs) +
    4629           0 :                                max_t(u16, 1, skb_shinfo(from)->gso_segs);
    4630             : 
    4631           0 :                 skb_shinfo(to)->gso_segs = min_t(u32, gso_segs, 0xFFFF);
    4632             :         }
    4633           0 :         return res;
    4634             : }
    4635             : 
    4636           3 : static void tcp_drop(struct sock *sk, struct sk_buff *skb)
    4637             : {
    4638           3 :         sk_drops_add(sk, skb);
    4639           3 :         __kfree_skb(skb);
    4640           3 : }
    4641             : 
    4642             : /* This one checks to see if we can put data from the
    4643             :  * out_of_order queue into the receive_queue.
    4644             :  */
    4645           0 : static void tcp_ofo_queue(struct sock *sk)
    4646             : {
    4647           0 :         struct tcp_sock *tp = tcp_sk(sk);
    4648           0 :         __u32 dsack_high = tp->rcv_nxt;
    4649           0 :         bool fin, fragstolen, eaten;
    4650           0 :         struct sk_buff *skb, *tail;
    4651           0 :         struct rb_node *p;
    4652             : 
    4653           0 :         p = rb_first(&tp->out_of_order_queue);
    4654           0 :         while (p) {
    4655           0 :                 skb = rb_to_skb(p);
    4656           0 :                 if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
    4657             :                         break;
    4658             : 
    4659           0 :                 if (before(TCP_SKB_CB(skb)->seq, dsack_high)) {
    4660           0 :                         __u32 dsack = dsack_high;
    4661           0 :                         if (before(TCP_SKB_CB(skb)->end_seq, dsack_high))
    4662           0 :                                 dsack_high = TCP_SKB_CB(skb)->end_seq;
    4663           0 :                         tcp_dsack_extend(sk, TCP_SKB_CB(skb)->seq, dsack);
    4664             :                 }
    4665           0 :                 p = rb_next(p);
    4666           0 :                 rb_erase(&skb->rbnode, &tp->out_of_order_queue);
    4667             : 
    4668           0 :                 if (unlikely(!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))) {
    4669           0 :                         tcp_drop(sk, skb);
    4670           0 :                         continue;
    4671             :                 }
    4672             : 
    4673           0 :                 tail = skb_peek_tail(&sk->sk_receive_queue);
    4674           0 :                 eaten = tail && tcp_try_coalesce(sk, tail, skb, &fragstolen);
    4675           0 :                 tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq);
    4676           0 :                 fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN;
    4677           0 :                 if (!eaten)
    4678           0 :                         __skb_queue_tail(&sk->sk_receive_queue, skb);
    4679             :                 else
    4680           0 :                         kfree_skb_partial(skb, fragstolen);
    4681             : 
    4682           0 :                 if (unlikely(fin)) {
    4683           0 :                         tcp_fin(sk);
    4684             :                         /* tcp_fin() purges tp->out_of_order_queue,
    4685             :                          * so we must end this loop right now.
    4686             :                          */
    4687           0 :                         break;
    4688             :                 }
    4689             :         }
    4690           0 : }
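
tcp_ofo_queue() pops segments off the out-of-order tree for as long as the head of the tree is contiguous with rcv_nxt, advancing rcv_nxt and dropping pure duplicates on the way. Here is a simplified sketch of that drain over a seq-sorted array (the kernel walks an rbtree and also handles coalescing, D-SACK and FIN); drain_ofo() and struct oseg are hypothetical names.

#include <stdint.h>
#include <stdio.h>

struct oseg { uint32_t seq, end_seq; };

/* Deliver queued out-of-order segments while they touch rcv_nxt.
 * @q is sorted by seq; returns how many segments were delivered and
 * updates *rcv_nxt, mirroring the contiguity check in tcp_ofo_queue().
 */
static int drain_ofo(const struct oseg *q, int n, uint32_t *rcv_nxt)
{
        int delivered = 0;

        for (int i = 0; i < n; i++) {
                if ((int32_t)(q[i].seq - *rcv_nxt) > 0)
                        break;                  /* gap: stop draining */
                if ((int32_t)(q[i].end_seq - *rcv_nxt) <= 0)
                        continue;               /* fully duplicate: drop */
                *rcv_nxt = q[i].end_seq;        /* in order now: deliver */
                delivered++;
        }
        return delivered;
}

int main(void)
{
        struct oseg q[] = { { 100, 200 }, { 200, 300 }, { 400, 500 } };
        uint32_t rcv_nxt = 100;

        int k = drain_ofo(q, 3, &rcv_nxt);
        printf("delivered %d segment(s), rcv_nxt is now %u\n",
               k, (unsigned)rcv_nxt);
        return 0;
}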
    4691             : 
    4692             : static bool tcp_prune_ofo_queue(struct sock *sk);
    4693             : static int tcp_prune_queue(struct sock *sk);
    4694             : 
    4695           7 : static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
    4696             :                                  unsigned int size)
    4697             : {
    4698          14 :         if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
    4699           7 :             !sk_rmem_schedule(sk, skb, size)) {
    4700             : 
    4701           0 :                 if (tcp_prune_queue(sk) < 0)
    4702             :                         return -1;
    4703             : 
    4704           0 :                 while (!sk_rmem_schedule(sk, skb, size)) {
    4705           0 :                         if (!tcp_prune_ofo_queue(sk))
    4706             :                                 return -1;
    4707             :                 }
    4708             :         }
    4709             :         return 0;
    4710             : }
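
The pattern in tcp_try_rmem_schedule() is: if the segment does not fit in the receive budget, prune and retry, and give up only when the fallback pruning step cannot reclaim anything. A minimal sketch of that retry shape with a toy byte budget and hypothetical prune callbacks (drop_some/drop_none); the real function delegates to tcp_prune_queue() and tcp_prune_ofo_queue().

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical receive-memory model: a budget in bytes. */
struct rcv_budget {
        int used, limit;
};

static bool fits(const struct rcv_budget *b, int size)
{
        return b->used + size <= b->limit;
}

/* Mirrors the shape of tcp_try_rmem_schedule(): try, prune, retry,
 * and fail only when the fallback prune step cannot make progress.
 */
static int try_schedule(struct rcv_budget *b, int size,
                        bool (*prune_queue)(struct rcv_budget *),
                        bool (*prune_ofo)(struct rcv_budget *))
{
        if (fits(b, size))
                return 0;
        if (!prune_queue(b))
                return -1;
        while (!fits(b, size)) {
                if (!prune_ofo(b))
                        return -1;
        }
        return 0;
}

static bool drop_some(struct rcv_budget *b) { b->used -= 512; return true; }
static bool drop_none(struct rcv_budget *b) { (void)b; return false; }

int main(void)
{
        struct rcv_budget b = { .used = 4096, .limit = 4096 };

        printf("scheduled: %d\n", try_schedule(&b, 256, drop_some, drop_none));
        return 0;
}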
    4711             : 
    4712           0 : static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
    4713             : {
    4714           0 :         struct tcp_sock *tp = tcp_sk(sk);
    4715           0 :         struct rb_node **p, *parent;
    4716           0 :         struct sk_buff *skb1;
    4717           0 :         u32 seq, end_seq;
    4718           0 :         bool fragstolen;
    4719             : 
    4720           0 :         tcp_ecn_check_ce(sk, skb);
    4721             : 
    4722           0 :         if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) {
    4723           0 :                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFODROP);
    4724           0 :                 sk->sk_data_ready(sk);
    4725           0 :                 tcp_drop(sk, skb);
    4726           0 :                 return;
    4727             :         }
    4728             : 
    4729             :         /* Disable header prediction. */
    4730           0 :         tp->pred_flags = 0;
    4731           0 :         inet_csk_schedule_ack(sk);
    4732             : 
    4733           0 :         tp->rcv_ooopack += max_t(u16, 1, skb_shinfo(skb)->gso_segs);
    4734           0 :         NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOQUEUE);
    4735           0 :         seq = TCP_SKB_CB(skb)->seq;
    4736           0 :         end_seq = TCP_SKB_CB(skb)->end_seq;
    4737             : 
    4738           0 :         p = &tp->out_of_order_queue.rb_node;
    4739           0 :         if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
    4740             :                 /* Initial out of order segment, build 1 SACK. */
    4741           0 :                 if (tcp_is_sack(tp)) {
    4742           0 :                         tp->rx_opt.num_sacks = 1;
    4743           0 :                         tp->selective_acks[0].start_seq = seq;
    4744           0 :                         tp->selective_acks[0].end_seq = end_seq;
    4745             :                 }
    4746           0 :                 rb_link_node(&skb->rbnode, NULL, p);
    4747           0 :                 rb_insert_color(&skb->rbnode, &tp->out_of_order_queue);
    4748           0 :                 tp->ooo_last_skb = skb;
    4749           0 :                 goto end;
    4750             :         }
    4751             : 
    4752             :         /* In the typical case, we are adding an skb to the end of the list.
    4753             :          * Use of ooo_last_skb avoids the O(Log(N)) rbtree lookup.
    4754             :          */
    4755           0 :         if (tcp_ooo_try_coalesce(sk, tp->ooo_last_skb,
    4756             :                                  skb, &fragstolen)) {
    4757           0 : coalesce_done:
    4758             :                 /* For non sack flows, do not grow window to force DUPACK
    4759             :                  * and trigger fast retransmit.
    4760             :                  */
    4761           0 :                 if (tcp_is_sack(tp))
    4762           0 :                         tcp_grow_window(sk, skb);
    4763           0 :                 kfree_skb_partial(skb, fragstolen);
    4764           0 :                 skb = NULL;
    4765           0 :                 goto add_sack;
    4766             :         }
    4767             :         /* Can avoid an rbtree lookup if we are adding skb after ooo_last_skb */
    4768           0 :         if (!before(seq, TCP_SKB_CB(tp->ooo_last_skb)->end_seq)) {
    4769           0 :                 parent = &tp->ooo_last_skb->rbnode;
    4770           0 :                 p = &parent->rb_right;
    4771           0 :                 goto insert;
    4772             :         }
    4773             : 
    4774             :         /* Find place to insert this segment. Handle overlaps on the way. */
    4775             :         parent = NULL;
    4776           0 :         while (*p) {
    4777           0 :                 parent = *p;
    4778           0 :                 skb1 = rb_to_skb(parent);
    4779           0 :                 if (before(seq, TCP_SKB_CB(skb1)->seq)) {
    4780           0 :                         p = &parent->rb_left;
    4781           0 :                         continue;
    4782             :                 }
    4783           0 :                 if (before(seq, TCP_SKB_CB(skb1)->end_seq)) {
    4784           0 :                         if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
    4785             :                                 /* All the bits are present. Drop. */
    4786           0 :                                 NET_INC_STATS(sock_net(sk),
    4787             :                                               LINUX_MIB_TCPOFOMERGE);
    4788           0 :                                 tcp_drop(sk, skb);
    4789           0 :                                 skb = NULL;
    4790           0 :                                 tcp_dsack_set(sk, seq, end_seq);
    4791           0 :                                 goto add_sack;
    4792             :                         }
    4793           0 :                         if (after(seq, TCP_SKB_CB(skb1)->seq)) {
    4794             :                                 /* Partial overlap. */
    4795           0 :                                 tcp_dsack_set(sk, seq, TCP_SKB_CB(skb1)->end_seq);
    4796             :                         } else {
    4797             :                                 /* skb's seq == skb1's seq and skb covers skb1.
    4798             :                                  * Replace skb1 with skb.
    4799             :                                  */
    4800           0 :                                 rb_replace_node(&skb1->rbnode, &skb->rbnode,
    4801             :                                                 &tp->out_of_order_queue);
    4802           0 :                                 tcp_dsack_extend(sk,
    4803             :                                                  TCP_SKB_CB(skb1)->seq,
    4804             :                                                  TCP_SKB_CB(skb1)->end_seq);
    4805           0 :                                 NET_INC_STATS(sock_net(sk),
    4806             :                                               LINUX_MIB_TCPOFOMERGE);
    4807           0 :                                 tcp_drop(sk, skb1);
    4808           0 :                                 goto merge_right;
    4809             :                         }
    4810           0 :                 } else if (tcp_ooo_try_coalesce(sk, skb1,
    4811             :                                                 skb, &fragstolen)) {
    4812           0 :                         goto coalesce_done;
    4813             :                 }
    4814           0 :                 p = &parent->rb_right;
    4815             :         }
    4816           0 : insert:
    4817             :         /* Insert segment into RB tree. */
    4818           0 :         rb_link_node(&skb->rbnode, parent, p);
    4819           0 :         rb_insert_color(&skb->rbnode, &tp->out_of_order_queue);
    4820             : 
    4821             : merge_right:
    4822             :         /* Remove other segments covered by skb. */
    4823           0 :         while ((skb1 = skb_rb_next(skb)) != NULL) {
    4824           0 :                 if (!after(end_seq, TCP_SKB_CB(skb1)->seq))
    4825             :                         break;
    4826           0 :                 if (before(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
    4827           0 :                         tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
    4828             :                                          end_seq);
    4829           0 :                         break;
    4830             :                 }
    4831           0 :                 rb_erase(&skb1->rbnode, &tp->out_of_order_queue);
    4832           0 :                 tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
    4833             :                                  TCP_SKB_CB(skb1)->end_seq);
    4834           0 :                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOMERGE);
    4835           0 :                 tcp_drop(sk, skb1);
    4836             :         }
    4837             :         /* If there is no skb after us, we are the last_skb! */
    4838           0 :         if (!skb1)
    4839           0 :                 tp->ooo_last_skb = skb;
    4840             : 
    4841           0 : add_sack:
    4842           0 :         if (tcp_is_sack(tp))
    4843           0 :                 tcp_sack_new_ofo_skb(sk, seq, end_seq);
    4844           0 : end:
    4845           0 :         if (skb) {
    4846             :                 /* For non sack flows, do not grow window to force DUPACK
    4847             :                  * and trigger fast retransmit.
    4848             :                  */
    4849           0 :                 if (tcp_is_sack(tp))
    4850           0 :                         tcp_grow_window(sk, skb);
    4851           0 :                 skb_condense(skb);
    4852           0 :                 skb_set_owner_r(skb, sk);
    4853             :         }
    4854             : }
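
The insertion walk above splits overlap handling into three cases: a new segment that is fully covered is dropped (with a D-SACK for its whole range), one that overlaps only the tail of an existing segment produces a partial D-SACK, and one that starts at the same sequence but reaches further replaces the queued segment. A compact sketch of that case split, with hypothetical names and wraparound-safe comparisons mirroring before()/after(); it assumes, as the walk does at this point, that the new segment does not start before the queued one.

#include <stdint.h>
#include <stdio.h>

static int seq_before(uint32_t a, uint32_t b) { return (int32_t)(a - b) < 0; }
static int seq_after(uint32_t a, uint32_t b)  { return seq_before(b, a); }

enum ofo_overlap { OFO_NO_OVERLAP, OFO_DUPLICATE, OFO_PARTIAL, OFO_COVERS };

/* Classify how a new segment [seq, end_seq) relates to a queued segment
 * [qseq, qend): fully covered segments are dropped, partial overlaps get
 * a D-SACK, and a segment with the same start that reaches further
 * replaces the queued one.
 */
static enum ofo_overlap classify(uint32_t seq, uint32_t end_seq,
                                 uint32_t qseq, uint32_t qend)
{
        if (!seq_before(seq, qend))
                return OFO_NO_OVERLAP;          /* starts at or after qend   */
        if (!seq_after(end_seq, qend))
                return OFO_DUPLICATE;           /* all the bits are present  */
        if (seq_after(seq, qseq))
                return OFO_PARTIAL;             /* overlaps the tail of q    */
        return OFO_COVERS;                      /* same start, reaches further */
}

int main(void)
{
        printf("%d\n", classify(100, 200, 100, 300));   /* OFO_DUPLICATE (1) */
        printf("%d\n", classify(150, 400, 100, 300));   /* OFO_PARTIAL (2)   */
        printf("%d\n", classify(100, 400, 100, 300));   /* OFO_COVERS (3)    */
        return 0;
}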
    4855             : 
    4856          70 : static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb,
    4857             :                                       bool *fragstolen)
    4858             : {
    4859          70 :         int eaten;
    4860          70 :         struct sk_buff *tail = skb_peek_tail(&sk->sk_receive_queue);
    4861             : 
    4862          86 :         eaten = (tail &&
    4863          16 :                  tcp_try_coalesce(sk, tail,
    4864          32 :                                   skb, fragstolen)) ? 1 : 0;
    4865          70 :         tcp_rcv_nxt_update(tcp_sk(sk), TCP_SKB_CB(skb)->end_seq);
    4866          70 :         if (!eaten) {
    4867          68 :                 __skb_queue_tail(&sk->sk_receive_queue, skb);
    4868          68 :                 skb_set_owner_r(skb, sk);
    4869             :         }
    4870          70 :         return eaten;
    4871             : }
    4872             : 
    4873           0 : int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size)
    4874             : {
    4875           0 :         struct sk_buff *skb;
    4876           0 :         int err = -ENOMEM;
    4877           0 :         int data_len = 0;
    4878           0 :         bool fragstolen;
    4879             : 
    4880           0 :         if (size == 0)
    4881             :                 return 0;
    4882             : 
    4883           0 :         if (size > PAGE_SIZE) {
    4884           0 :                 int npages = min_t(size_t, size >> PAGE_SHIFT, MAX_SKB_FRAGS);
    4885             : 
    4886           0 :                 data_len = npages << PAGE_SHIFT;
    4887           0 :                 size = data_len + (size & ~PAGE_MASK);
    4888             :         }
    4889           0 :         skb = alloc_skb_with_frags(size - data_len, data_len,
    4890             :                                    PAGE_ALLOC_COSTLY_ORDER,
    4891             :                                    &err, sk->sk_allocation);
    4892           0 :         if (!skb)
    4893           0 :                 goto err;
    4894             : 
    4895           0 :         skb_put(skb, size - data_len);
    4896           0 :         skb->data_len = data_len;
    4897           0 :         skb->len = size;
    4898             : 
    4899           0 :         if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) {
    4900           0 :                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVQDROP);
    4901           0 :                 goto err_free;
    4902             :         }
    4903             : 
    4904           0 :         err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
    4905           0 :         if (err)
    4906           0 :                 goto err_free;
    4907             : 
    4908           0 :         TCP_SKB_CB(skb)->seq = tcp_sk(sk)->rcv_nxt;
    4909           0 :         TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + size;
    4910           0 :         TCP_SKB_CB(skb)->ack_seq = tcp_sk(sk)->snd_una - 1;
    4911             : 
    4912           0 :         if (tcp_queue_rcv(sk, skb, &fragstolen)) {
    4913           0 :                 WARN_ON_ONCE(fragstolen); /* should not happen */
    4914           0 :                 __kfree_skb(skb);
    4915             :         }
    4916             :         return size;
    4917             : 
    4918           0 : err_free:
    4919           0 :         kfree_skb(skb);
    4920           0 : err:
    4921           0 :         return err;
    4922             : 
    4923             : }
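
For requests larger than a page, tcp_send_rcvq() splits the payload into a page-aligned part destined for skb fragments (data_len, capped at MAX_SKB_FRAGS pages) plus a sub-page linear remainder. A worked sketch of that split, assuming illustrative 4 KiB pages and a 17-fragment cap; PG_SHIFT, PG_SIZE, PG_MASK and MAX_FRAGS are stand-ins, not the kernel macros.

#include <stddef.h>
#include <stdio.h>

/* Assumed constants for illustration only (4 KiB pages, 17 frags). */
#define PG_SHIFT        12
#define PG_SIZE         (1UL << PG_SHIFT)
#define PG_MASK         (~(PG_SIZE - 1))
#define MAX_FRAGS       17

int main(void)
{
        size_t size = 70000;            /* requested bytes */
        size_t data_len = 0;

        if (size > PG_SIZE) {
                size_t npages = size >> PG_SHIFT;

                if (npages > MAX_FRAGS)
                        npages = MAX_FRAGS;
                data_len = npages << PG_SHIFT;          /* paged part  */
                size = data_len + (size & ~PG_MASK);    /* + remainder */
        }

        /* 70000 = 17 pages (69632 bytes) of frag data + 368 linear bytes. */
        printf("linear=%zu paged=%zu total=%zu\n",
               size - data_len, data_len, size);
        return 0;
}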
    4924             : 
    4925          70 : void tcp_data_ready(struct sock *sk)
    4926             : {
    4927          70 :         if (tcp_epollin_ready(sk, sk->sk_rcvlowat) || sock_flag(sk, SOCK_DONE))
    4928          70 :                 sk->sk_data_ready(sk);
    4929          70 : }
    4930             : 
    4931          19 : static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
    4932             : {
    4933          19 :         struct tcp_sock *tp = tcp_sk(sk);
    4934          19 :         bool fragstolen;
    4935          19 :         int eaten;
    4936             : 
    4937          19 :         if (sk_is_mptcp(sk))
    4938          19 :                 mptcp_incoming_options(sk, skb);
    4939             : 
    4940          19 :         if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) {
    4941           4 :                 __kfree_skb(skb);
    4942          23 :                 return;
    4943             :         }
    4944          15 :         skb_dst_drop(skb);
    4945          15 :         __skb_pull(skb, tcp_hdr(skb)->doff * 4);
    4946             : 
    4947          15 :         tp->rx_opt.dsack = 0;
    4948             : 
    4949             :         /*  Queue data for delivery to the user.
    4950             :          *  Packets in sequence go to the receive queue.
    4951             :          *  Out of sequence packets to the out_of_order_queue.
    4952             :          */
    4953          15 :         if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
    4954          15 :                 if (tcp_receive_window(tp) == 0) {
    4955           0 :                         NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPZEROWINDOWDROP);
    4956           0 :                         goto out_of_window;
    4957             :                 }
    4958             : 
    4959             :                 /* Ok. In sequence. In window. */
    4960          15 : queue_and_out:
    4961          15 :                 if (skb_queue_len(&sk->sk_receive_queue) == 0)
    4962           8 :                         sk_forced_mem_schedule(sk, skb->truesize);
    4963           7 :                 else if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) {
    4964           0 :                         NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVQDROP);
    4965           0 :                         sk->sk_data_ready(sk);
    4966           0 :                         goto drop;
    4967             :                 }
    4968             : 
    4969          15 :                 eaten = tcp_queue_rcv(sk, skb, &fragstolen);
    4970          15 :                 if (skb->len)
    4971          12 :                         tcp_event_data_recv(sk, skb);
    4972          15 :                 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
    4973           3 :                         tcp_fin(sk);
    4974             : 
    4975          15 :                 if (!RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
    4976           0 :                         tcp_ofo_queue(sk);
    4977             : 
    4978             :                         /* RFC 5681 sec. 4.2: SHOULD send an immediate ACK
    4979             :                          * when a gap in the queue is filled.
    4980             :                          */
    4981           0 :                         if (RB_EMPTY_ROOT(&tp->out_of_order_queue))
    4982           0 :                                 inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW;
    4983             :                 }
    4984             : 
    4985          15 :                 if (tp->rx_opt.num_sacks)
    4986           0 :                         tcp_sack_remove(tp);
    4987             : 
    4988          15 :                 tcp_fast_path_check(sk);
    4989             : 
    4990          15 :                 if (eaten > 0)
    4991           2 :                         kfree_skb_partial(skb, fragstolen);
    4992          15 :                 if (!sock_flag(sk, SOCK_DEAD))
    4993          15 :                         tcp_data_ready(sk);
    4994          15 :                 return;
    4995             :         }
    4996             : 
    4997           0 :         if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
    4998           0 :                 tcp_rcv_spurious_retrans(sk, skb);
    4999             :                 /* A retransmit, 2nd most common case.  Force an immediate ack. */
    5000           0 :                 NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
    5001           0 :                 tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
    5002             : 
    5003           0 : out_of_window:
    5004           0 :                 tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS);
    5005           0 :                 inet_csk_schedule_ack(sk);
    5006           0 : drop:
    5007           0 :                 tcp_drop(sk, skb);
    5008           0 :                 return;
    5009             :         }
    5010             : 
    5011             :         /* Out of window, e.g. a zero window probe. */
    5012           0 :         if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt + tcp_receive_window(tp)))
    5013           0 :                 goto out_of_window;
    5014             : 
    5015           0 :         if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
    5016             :                 /* Partial packet, seq < rcv_nxt < end_seq */
    5017           0 :                 tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, tp->rcv_nxt);
    5018             : 
    5019             :                 /* If window is closed, drop tail of packet. But after
    5020             :                  * remembering D-SACK for its head made in previous line.
    5021             :                  */
    5022           0 :                 if (!tcp_receive_window(tp)) {
    5023           0 :                         NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPZEROWINDOWDROP);
    5024           0 :                         goto out_of_window;
    5025             :                 }
    5026           0 :                 goto queue_and_out;
    5027             :         }
    5028             : 
    5029           0 :         tcp_data_queue_ofo(sk, skb);
    5030             : }
    5031             : 
    5032           0 : static struct sk_buff *tcp_skb_next(struct sk_buff *skb, struct sk_buff_head *list)
    5033             : {
    5034           0 :         if (list)
    5035           0 :                 return !skb_queue_is_last(list, skb) ? skb->next : NULL;
    5036             : 
    5037           0 :         return skb_rb_next(skb);
    5038             : }
    5039             : 
    5040           0 : static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb,
    5041             :                                         struct sk_buff_head *list,
    5042             :                                         struct rb_root *root)
    5043             : {
    5044           0 :         struct sk_buff *next = tcp_skb_next(skb, list);
    5045             : 
    5046           0 :         if (list)
    5047           0 :                 __skb_unlink(skb, list);
    5048             :         else
    5049           0 :                 rb_erase(&skb->rbnode, root);
    5050             : 
    5051           0 :         __kfree_skb(skb);
    5052           0 :         NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVCOLLAPSED);
    5053             : 
    5054           0 :         return next;
    5055             : }
    5056             : 
    5057             : /* Insert skb into rb tree, ordered by TCP_SKB_CB(skb)->seq */
    5058         364 : void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb)
    5059             : {
    5060         364 :         struct rb_node **p = &root->rb_node;
    5061         364 :         struct rb_node *parent = NULL;
    5062         364 :         struct sk_buff *skb1;
    5063             : 
    5064         514 :         while (*p) {
    5065         150 :                 parent = *p;
    5066         150 :                 skb1 = rb_to_skb(parent);
    5067         150 :                 if (before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb1)->seq))
    5068           0 :                         p = &parent->rb_left;
    5069             :                 else
    5070         150 :                         p = &parent->rb_right;
    5071             :         }
    5072         364 :         rb_link_node(&skb->rbnode, parent, p);
    5073         364 :         rb_insert_color(&skb->rbnode, root);
    5074         364 : }
    5075             : 
    5076             : /* Collapse contiguous sequence of skbs head..tail with
    5077             :  * sequence numbers start..end.
    5078             :  *
    5079             :  * If tail is NULL, this means until the end of the queue.
    5080             :  *
    5081             :  * Segments with FIN/SYN are not collapsed (only because this
    5082             :  * simplifies code)
    5083             :  */
    5084             : static void
    5085           0 : tcp_collapse(struct sock *sk, struct sk_buff_head *list, struct rb_root *root,
    5086             :              struct sk_buff *head, struct sk_buff *tail, u32 start, u32 end)
    5087             : {
    5088           0 :         struct sk_buff *skb = head, *n;
    5089           0 :         struct sk_buff_head tmp;
    5090           0 :         bool end_of_skbs;
    5091             : 
    5092             :         /* First, check that queue is collapsible and find
    5093             :          * the point where collapsing can be useful.
    5094             :          */
    5095           0 : restart:
    5096           0 :         for (end_of_skbs = true; skb != NULL && skb != tail; skb = n) {
    5097           0 :                 n = tcp_skb_next(skb, list);
    5098             : 
    5099             :                 /* No new bits? It is possible on ofo queue. */
    5100           0 :                 if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
    5101           0 :                         skb = tcp_collapse_one(sk, skb, list, root);
    5102           0 :                         if (!skb)
    5103             :                                 break;
    5104           0 :                         goto restart;
    5105             :                 }
    5106             : 
    5107             :                 /* The first skb to collapse is:
    5108             :                  * - not SYN/FIN and
    5109             :                  * - bloated or contains data before "start" or
    5110             :                  *   overlaps the next one and mptcp allows collapsing.
    5111             :                  */
    5112           0 :                 if (!(TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)) &&
    5113           0 :                     (tcp_win_from_space(sk, skb->truesize) > skb->len ||
    5114           0 :                      before(TCP_SKB_CB(skb)->seq, start))) {
    5115             :                         end_of_skbs = false;
    5116             :                         break;
    5117             :                 }
    5118             : 
    5119           0 :                 if (n && n != tail && mptcp_skb_can_collapse(skb, n) &&
    5120           0 :                     TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(n)->seq) {
    5121             :                         end_of_skbs = false;
    5122             :                         break;
    5123             :                 }
    5124             : 
    5125             :                 /* Decided to skip this, advance start seq. */
    5126             :                 start = TCP_SKB_CB(skb)->end_seq;
    5127             :         }
    5128           0 :         if (end_of_skbs ||
    5129           0 :             (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)))
    5130           0 :                 return;
    5131             : 
    5132           0 :         __skb_queue_head_init(&tmp);
    5133             : 
    5134           0 :         while (before(start, end)) {
    5135           0 :                 int copy = min_t(int, SKB_MAX_ORDER(0, 0), end - start);
    5136           0 :                 struct sk_buff *nskb;
    5137             : 
    5138           0 :                 nskb = alloc_skb(copy, GFP_ATOMIC);
    5139           0 :                 if (!nskb)
    5140             :                         break;
    5141             : 
    5142           0 :                 memcpy(nskb->cb, skb->cb, sizeof(skb->cb));
    5143             : #ifdef CONFIG_TLS_DEVICE
    5144             :                 nskb->decrypted = skb->decrypted;
    5145             : #endif
    5146           0 :                 TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start;
    5147           0 :                 if (list)
    5148           0 :                         __skb_queue_before(list, skb, nskb);
    5149             :                 else
    5150           0 :                         __skb_queue_tail(&tmp, nskb); /* defer rbtree insertion */
    5151           0 :                 skb_set_owner_r(nskb, sk);
    5152           0 :                 mptcp_skb_ext_move(nskb, skb);
    5153             : 
    5154             :                 /* Copy data, releasing collapsed skbs. */
    5155           0 :                 while (copy > 0) {
    5156           0 :                         int offset = start - TCP_SKB_CB(skb)->seq;
    5157           0 :                         int size = TCP_SKB_CB(skb)->end_seq - start;
    5158             : 
    5159           0 :                         BUG_ON(offset < 0);
    5160           0 :                         if (size > 0) {
    5161           0 :                                 size = min(copy, size);
    5162           0 :                                 if (skb_copy_bits(skb, offset, skb_put(nskb, size), size))
    5163           0 :                                         BUG();
    5164           0 :                                 TCP_SKB_CB(nskb)->end_seq += size;
    5165           0 :                                 copy -= size;
    5166           0 :                                 start += size;
    5167             :                         }
    5168           0 :                         if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
    5169           0 :                                 skb = tcp_collapse_one(sk, skb, list, root);
    5170           0 :                                 if (!skb ||
    5171           0 :                                     skb == tail ||
    5172           0 :                                     !mptcp_skb_can_collapse(nskb, skb) ||
    5173           0 :                                     (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)))
    5174           0 :                                         goto end;
    5175             : #ifdef CONFIG_TLS_DEVICE
    5176             :                                 if (skb->decrypted != nskb->decrypted)
    5177             :                                         goto end;
    5178             : #endif
    5179             :                         }
    5180             :                 }
    5181             :         }
    5182           0 : end:
    5183           0 :         skb_queue_walk_safe(&tmp, skb, n)
    5184           0 :                 tcp_rbtree_insert(root, skb);
    5185             : }
    5186             : 
    5187             : /* Collapse ofo queue. Algorithm: select contiguous sequence of skbs
    5188             :  * and tcp_collapse() them until all the queue is collapsed.
    5189             :  */
    5190           0 : static void tcp_collapse_ofo_queue(struct sock *sk)
    5191             : {
    5192           0 :         struct tcp_sock *tp = tcp_sk(sk);
    5193           0 :         u32 range_truesize, sum_tiny = 0;
    5194           0 :         struct sk_buff *skb, *head;
    5195           0 :         u32 start, end;
    5196             : 
    5197           0 :         skb = skb_rb_first(&tp->out_of_order_queue);
    5198           0 : new_range:
    5199           0 :         if (!skb) {
    5200           0 :                 tp->ooo_last_skb = skb_rb_last(&tp->out_of_order_queue);
    5201           0 :                 return;
    5202             :         }
    5203           0 :         start = TCP_SKB_CB(skb)->seq;
    5204           0 :         end = TCP_SKB_CB(skb)->end_seq;
    5205           0 :         range_truesize = skb->truesize;
    5206             : 
    5207           0 :         for (head = skb;;) {
    5208           0 :                 skb = skb_rb_next(skb);
    5209             : 
    5210             :                 /* Range is terminated when we see a gap or when
    5211             :                  * we are at the queue end.
    5212             :                  */
    5213           0 :                 if (!skb ||
    5214           0 :                     after(TCP_SKB_CB(skb)->seq, end) ||
    5215           0 :                     before(TCP_SKB_CB(skb)->end_seq, start)) {
    5216             :                         /* Do not attempt collapsing tiny skbs */
    5217           0 :                         if (range_truesize != head->truesize ||
    5218           0 :                             end - start >= SKB_WITH_OVERHEAD(SK_MEM_QUANTUM)) {
    5219           0 :                                 tcp_collapse(sk, NULL, &tp->out_of_order_queue,
    5220             :                                              head, skb, start, end);
    5221             :                         } else {
    5222           0 :                                 sum_tiny += range_truesize;
    5223           0 :                                 if (sum_tiny > sk->sk_rcvbuf >> 3)
    5224             :                                         return;
    5225             :                         }
    5226           0 :                         goto new_range;
    5227             :                 }
    5228             : 
    5229           0 :                 range_truesize += skb->truesize;
    5230           0 :                 if (unlikely(before(TCP_SKB_CB(skb)->seq, start)))
    5231           0 :                         start = TCP_SKB_CB(skb)->seq;
    5232           0 :                 if (after(TCP_SKB_CB(skb)->end_seq, end))
    5233           0 :                         end = TCP_SKB_CB(skb)->end_seq;
    5234             :         }
    5235             : }
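
tcp_collapse_ofo_queue() walks the seq-sorted out-of-order segments and carves them into maximal contiguous ranges, ending a range whenever the next segment starts beyond the current end. A sketch of just that range grouping over a sorted array; struct oseg and print_ranges() are hypothetical, and the kernel additionally tracks truesize and skips tiny ranges.

#include <stdint.h>
#include <stdio.h>

struct oseg { uint32_t seq, end_seq; };

static int seq_after(uint32_t a, uint32_t b) { return (int32_t)(b - a) < 0; }

/* Print the maximal contiguous [start, end) ranges in a seq-sorted array,
 * ending a range whenever the next segment starts past the current end.
 */
static void print_ranges(const struct oseg *q, int n)
{
        int i = 0;

        while (i < n) {
                uint32_t start = q[i].seq, end = q[i].end_seq;

                for (i++; i < n && !seq_after(q[i].seq, end); i++) {
                        if (seq_after(q[i].end_seq, end))
                                end = q[i].end_seq;
                }
                printf("range [%u, %u)\n", (unsigned)start, (unsigned)end);
        }
}

int main(void)
{
        struct oseg q[] = { { 100, 200 }, { 150, 300 }, { 400, 500 } };

        print_ranges(q, 3);     /* [100, 300) and [400, 500) */
        return 0;
}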
    5236             : 
    5237             : /*
    5238             :  * Clean the out-of-order queue to make room.
    5239             :  * We drop the highest-sequence packets in order to:
    5240             :  * 1) give holes a chance to be filled,
    5241             :  * 2) avoid adding large latencies if thousands of packets sit there
    5242             :  *    (but if the application shrinks SO_RCVBUF, we could still end up
    5243             :  *     freeing the whole queue here),
    5244             :  * 3) drop at least 12.5 % of sk_rcvbuf to mitigate malicious attacks.
    5245             :  *
    5246             :  * Return true if queue has shrunk.
    5247             :  */
    5248           0 : static bool tcp_prune_ofo_queue(struct sock *sk)
    5249             : {
    5250           0 :         struct tcp_sock *tp = tcp_sk(sk);
    5251           0 :         struct rb_node *node, *prev;
    5252           0 :         int goal;
    5253             : 
    5254           0 :         if (RB_EMPTY_ROOT(&tp->out_of_order_queue))
    5255             :                 return false;
    5256             : 
    5257           0 :         NET_INC_STATS(sock_net(sk), LINUX_MIB_OFOPRUNED);
    5258           0 :         goal = sk->sk_rcvbuf >> 3;
    5259           0 :         node = &tp->ooo_last_skb->rbnode;
    5260           0 :         do {
    5261           0 :                 prev = rb_prev(node);
    5262           0 :                 rb_erase(node, &tp->out_of_order_queue);
    5263           0 :                 goal -= rb_to_skb(node)->truesize;
    5264           0 :                 tcp_drop(sk, rb_to_skb(node));
    5265           0 :                 if (!prev || goal <= 0) {
    5266           0 :                         sk_mem_reclaim(sk);
    5267           0 :                         if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
    5268           0 :                             !tcp_under_memory_pressure(sk))
    5269             :                                 break;
    5270           0 :                         goal = sk->sk_rcvbuf >> 3;
    5271             :                 }
    5272           0 :                 node = prev;
    5273           0 :         } while (node);
    5274           0 :         tp->ooo_last_skb = rb_to_skb(prev);
    5275             : 
    5276             :         /* Reset SACK state.  A conforming SACK implementation will
    5277             :          * do the same at a timeout based retransmit.  When a connection
    5278             :          * is in a sad state like this, we care only about integrity
    5279             :          * of the connection not performance.
    5280             :          */
    5281           0 :         if (tp->rx_opt.sack_ok)
    5282           0 :                 tcp_sack_reset(&tp->rx_opt);
    5283             :         return true;
    5284             : }
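
Pruning frees out-of-order segments from the highest sequences backwards until at least sk_rcvbuf >> 3 bytes, i.e. 12.5 % of the receive buffer, have been reclaimed. A minimal sketch of that goal accounting over a plain array with the newest segment last; prune_ofo() is a hypothetical name, and the kernel walks the rbtree from ooo_last_skb and also rechecks memory pressure before stopping.

#include <stdio.h>

/* Drop entries from the tail of @truesize[] (highest sequences last)
 * until at least rcvbuf/8 bytes have been reclaimed, mirroring the
 * "goal = sk->sk_rcvbuf >> 3" accounting above. Returns entries left.
 */
static int prune_ofo(int *truesize, int n, int rcvbuf)
{
        int goal = rcvbuf >> 3;

        while (n > 0 && goal > 0) {
                n--;
                goal -= truesize[n];
        }
        return n;
}

int main(void)
{
        int truesize[] = { 2048, 2048, 4096, 4096 };    /* oldest..newest */
        int left = prune_ofo(truesize, 4, 65536);

        printf("segments left after pruning: %d\n", left);     /* 2 */
        return 0;
}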
    5285             : 
    5286             : /* Reduce allocated memory if we can, trying to get
    5287             :  * the socket within its memory limits again.
    5288             :  *
    5289             :  * Return less than zero if we should start dropping frames
    5290             :  * until the socket owning process reads some of the data
    5291             :  * to stabilize the situation.
    5292             :  */
    5293           0 : static int tcp_prune_queue(struct sock *sk)
    5294             : {
    5295           0 :         struct tcp_sock *tp = tcp_sk(sk);
    5296             : 
    5297           0 :         NET_INC_STATS(sock_net(sk), LINUX_MIB_PRUNECALLED);
    5298             : 
    5299           0 :         if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
    5300           0 :                 tcp_clamp_window(sk);
    5301           0 :         else if (tcp_under_memory_pressure(sk))
    5302           0 :                 tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss);
    5303             : 
    5304           0 :         if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
    5305             :                 return 0;
    5306             : 
    5307           0 :         tcp_collapse_ofo_queue(sk);
    5308           0 :         if (!skb_queue_empty(&sk->sk_receive_queue))
    5309           0 :                 tcp_collapse(sk, &sk->sk_receive_queue, NULL,
    5310           0 :                              skb_peek(&sk->sk_receive_queue),
    5311             :                              NULL,
    5312             :                              tp->copied_seq, tp->rcv_nxt);
    5313           0 :         sk_mem_reclaim(sk);
    5314             : 
    5315           0 :         if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
    5316             :                 return 0;
    5317             : 
    5318             :         /* Collapsing did not help, destructive actions follow.
    5319             :          * This must not ever occur. */
    5320             : 
    5321           0 :         tcp_prune_ofo_queue(sk);
    5322             : 
    5323           0 :         if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
    5324             :                 return 0;
    5325             : 
    5326             :         /* If we are really being abused, tell the caller to silently
    5327             :          * drop receive data on the floor.  It will get retransmitted
    5328             :          * and hopefully then we'll have sufficient space.
    5329             :          */
    5330           0 :         NET_INC_STATS(sock_net(sk), LINUX_MIB_RCVPRUNED);
    5331             : 
    5332             :         /* Massive buffer overcommit. */
    5333           0 :         tp->pred_flags = 0;
    5334           0 :         return -1;
    5335             : }
    5336             : 
    5337           0 : static bool tcp_should_expand_sndbuf(const struct sock *sk)
    5338             : {
    5339           0 :         const struct tcp_sock *tp = tcp_sk(sk);
    5340             : 
    5341             :         /* If the user specified a specific send buffer setting, do
    5342             :          * not modify it.
    5343             :          */
    5344           0 :         if (sk->sk_userlocks & SOCK_SNDBUF_LOCK)
    5345             :                 return false;
    5346             : 
    5347             :         /* If we are under global TCP memory pressure, do not expand.  */
    5348           0 :         if (tcp_under_memory_pressure(sk))
    5349             :                 return false;
    5350             : 
    5351             :         /* If we are under soft global TCP memory pressure, do not expand.  */
    5352           0 :         if (sk_memory_allocated(sk) >= sk_prot_mem_limits(sk, 0))
    5353             :                 return false;
    5354             : 
    5355             :         /* If we filled the congestion window, do not expand.  */
    5356           0 :         if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
    5357           0 :                 return false;
    5358             : 
    5359             :         return true;
    5360             : }
    5361             : 
    5362           0 : static void tcp_new_space(struct sock *sk)
    5363             : {
    5364           0 :         struct tcp_sock *tp = tcp_sk(sk);
    5365             : 
    5366           0 :         if (tcp_should_expand_sndbuf(sk)) {
    5367           0 :                 tcp_sndbuf_expand(sk);
    5368           0 :                 tp->snd_cwnd_stamp = tcp_jiffies32;
    5369             :         }
    5370             : 
    5371           0 :         sk->sk_write_space(sk);
    5372           0 : }
    5373             : 
    5374         367 : static void tcp_check_space(struct sock *sk)
    5375             : {
    5376             :         /* pairs with tcp_poll() */
    5377         367 :         smp_mb();
    5378         367 :         if (sk->sk_socket &&
    5379         359 :             test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
    5380           0 :                 tcp_new_space(sk);
    5381           0 :                 if (!test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
    5382           0 :                         tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
    5383             :         }
    5384         367 : }
    5385             : 
    5386         367 : static inline void tcp_data_snd_check(struct sock *sk)
    5387             : {
    5388         367 :         tcp_push_pending_frames(sk);
    5389         367 :         tcp_check_space(sk);
    5390             : }
    5391             : 
    5392             : /*
    5393             :  * Check if sending an ack is needed.
    5394             :  */
    5395          70 : static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
    5396             : {
    5397          70 :         struct tcp_sock *tp = tcp_sk(sk);
    5398          70 :         unsigned long rtt, delay;
    5399             : 
    5400             :             /* More than one full frame received... */
    5401          70 :         if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss &&
    5402             :              /* ... and right edge of window advances far enough.
    5403             :               * (tcp_recvmsg() will send ACK otherwise).
    5404             :               * If application uses SO_RCVLOWAT, we want send ack now if
    5405             :               * we have not received enough bytes to satisfy the condition.
    5406             :               */
    5407          15 :             (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat ||
    5408          73 :              __tcp_select_window(sk) >= tp->rcv_wnd)) ||
    5409             :             /* We ACK each frame or... */
    5410          58 :             tcp_in_quickack_mode(sk) ||
    5411             :             /* Protocol state mandates a one-time immediate ACK */
    5412          14 :             inet_csk(sk)->icsk_ack.pending & ICSK_ACK_NOW) {
    5413          56 : send_now:
    5414          56 :                 tcp_send_ack(sk);
    5415          56 :                 return;
    5416             :         }
    5417             : 
    5418          14 :         if (!ofo_possible || RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
    5419          14 :                 tcp_send_delayed_ack(sk);
    5420          14 :                 return;
    5421             :         }
    5422             : 
    5423           0 :         if (!tcp_is_sack(tp) ||
    5424           0 :             tp->compressed_ack >= sock_net(sk)->ipv4.sysctl_tcp_comp_sack_nr)
    5425           0 :                 goto send_now;
    5426             : 
    5427           0 :         if (tp->compressed_ack_rcv_nxt != tp->rcv_nxt) {
    5428           0 :                 tp->compressed_ack_rcv_nxt = tp->rcv_nxt;
    5429           0 :                 tp->dup_ack_counter = 0;
    5430             :         }
    5431           0 :         if (tp->dup_ack_counter < TCP_FASTRETRANS_THRESH) {
    5432           0 :                 tp->dup_ack_counter++;
    5433           0 :                 goto send_now;
    5434             :         }
    5435           0 :         tp->compressed_ack++;
    5436           0 :         if (hrtimer_is_queued(&tp->compressed_ack_timer))
    5437             :                 return;
    5438             : 
    5439             :         /* compress ack timer : 5 % of rtt, but no more than tcp_comp_sack_delay_ns */
    5440             : 
    5441           0 :         rtt = tp->rcv_rtt_est.rtt_us;
    5442           0 :         if (tp->srtt_us && tp->srtt_us < rtt)
    5443           0 :                 rtt = tp->srtt_us;
    5444             : 
    5445           0 :         delay = min_t(unsigned long, sock_net(sk)->ipv4.sysctl_tcp_comp_sack_delay_ns,
    5446             :                       rtt * (NSEC_PER_USEC >> 3)/20);
    5447           0 :         sock_hold(sk);
    5448           0 :         hrtimer_start_range_ns(&tp->compressed_ack_timer, ns_to_ktime(delay),
    5449           0 :                                sock_net(sk)->ipv4.sysctl_tcp_comp_sack_slack_ns,
    5450             :                                HRTIMER_MODE_REL_PINNED_SOFT);
    5451             : }
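
The compressed-ACK delay above is "5 % of the RTT, capped by the tcp_comp_sack_delay_ns sysctl". The RTT fields used here (rcv_rtt_est.rtt_us, srtt_us) are stored left-shifted by 3, which is why the expression multiplies by NSEC_PER_USEC >> 3 and divides by 20. A worked example of that arithmetic with assumed values; the 1 ms cap is an assumption for illustration, not necessarily the sysctl's value.

#include <stdio.h>

#define NSEC_PER_USEC   1000UL

int main(void)
{
        unsigned long rtt_usec = 40000;         /* a 40 ms RTT              */
        unsigned long rtt = rtt_usec << 3;      /* stored as usec << 3      */
        unsigned long cap = 1000000;            /* assumed cap: 1 ms in ns  */

        /* rtt * (NSEC_PER_USEC >> 3) / 20  ==  rtt_usec * 1000 / 20 ns,
         * i.e. exactly 5 % of the RTT expressed in nanoseconds.
         */
        unsigned long delay = rtt * (NSEC_PER_USEC >> 3) / 20;

        if (delay > cap)
                delay = cap;

        printf("5%% of rtt = %lu ns, capped delay = %lu ns\n",
               rtt_usec * 50, delay);
        return 0;
}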
    5452             : 
    5453          19 : static inline void tcp_ack_snd_check(struct sock *sk)
    5454             : {
    5455          19 :         if (!inet_csk_ack_scheduled(sk)) {
    5456             :                 /* We sent a data segment already. */
    5457             :                 return;
    5458             :         }
    5459          15 :         __tcp_ack_snd_check(sk, 1);
    5460             : }
    5461             : 
    5462             : /*
    5463             :  *      This routine is only called when we have urgent data
    5464             :  *      signaled. It's the 'slow' part of tcp_urg. It could be
    5465             :  *      moved inline now, as tcp_urg is only called from one
    5466             :  *      place. We knowingly handle URGent data "wrong"; we have to, as
    5467             :  *      BSD still doesn't use the correction from RFC 961.
    5468             :  *      For 1003.1g we should support a new option TCP_STDURG to permit
    5469             :  *      either form (or just set the sysctl tcp_stdurg).
    5470             :  */
    5471             : 
    5472           0 : static void tcp_check_urg(struct sock *sk, const struct tcphdr *th)
    5473             : {
    5474           0 :         struct tcp_sock *tp = tcp_sk(sk);
    5475           0 :         u32 ptr = ntohs(th->urg_ptr);
    5476             : 
    5477           0 :         if (ptr && !sock_net(sk)->ipv4.sysctl_tcp_stdurg)
    5478           0 :                 ptr--;
    5479           0 :         ptr += ntohl(th->seq);
    5480             : 
    5481             :         /* Ignore urgent data that we've already seen and read. */
    5482           0 :         if (after(tp->copied_seq, ptr))
    5483             :                 return;
    5484             : 
    5485             :         /* Do not replay urg ptr.
    5486             :          *
    5487             :          * NOTE: interesting situation not covered by specs.
    5488             :          * A misbehaving sender may send an urg ptr pointing to a segment
    5489             :          * we already have in the ofo queue. We are not able to fetch
    5490             :          * such data and will stay in TCP_URG_NOTYET until it is eaten
    5491             :          * by recvmsg(). It seems we are not obliged to handle such wicked
    5492             :          * situations, but it is worth considering the possibility of
    5493             :          * DoSes via some hypothetical application-level deadlock.
    5494             :          */
    5495           0 :         if (before(ptr, tp->rcv_nxt))
    5496             :                 return;
    5497             : 
    5498             :         /* Do we already have a newer (or duplicate) urgent pointer? */
    5499           0 :         if (tp->urg_data && !after(ptr, tp->urg_seq))
    5500             :                 return;
    5501             : 
    5502             :         /* Tell the world about our new urgent pointer. */
    5503           0 :         sk_send_sigurg(sk);
    5504             : 
    5505             :         /* We may be adding urgent data when the last byte read was
    5506             :          * urgent. To do this requires some care. We cannot just ignore
    5507             :          * tp->copied_seq since we would read the last urgent byte again
    5508             :          * as data, nor can we alter copied_seq until this data arrives
    5509             :          * or we break the semantics of SIOCATMARK (and thus sockatmark())
    5510             :          *
    5511             :          * NOTE. The above is rather Double Dutch; in plain English: the author
    5512             :          * of that comment did something like  send("A", MSG_OOB); send("B", MSG_OOB);
    5513             :          * and expected both A and B to disappear from the stream. This is _wrong_.
    5514             :          * Though BSD behaves that way with high probability, it is coincidental.
    5515             :          * Any application relying on it is buggy. Note also that the "fix" works
    5516             :          * only in this artificial test: insert some normal data between A and B
    5517             :          * and we will diverge from BSD again. Verdict: it is better to remove
    5518             :          * it and trap buggy users.
    5519             :          */
    5520           0 :         if (tp->urg_seq == tp->copied_seq && tp->urg_data &&
    5521           0 :             !sock_flag(sk, SOCK_URGINLINE) && tp->copied_seq != tp->rcv_nxt) {
    5522           0 :                 struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
    5523           0 :                 tp->copied_seq++;
    5524           0 :                 if (skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq)) {
    5525           0 :                         __skb_unlink(skb, &sk->sk_receive_queue);
    5526           0 :                         __kfree_skb(skb);
    5527             :                 }
    5528             :         }
    5529             : 
    5530           0 :         tp->urg_data = TCP_URG_NOTYET;
    5531           0 :         WRITE_ONCE(tp->urg_seq, ptr);
    5532             : 
    5533             :         /* Disable header prediction. */
    5534           0 :         tp->pred_flags = 0;
    5535             : }
    5536             : 
    5537             : /* This is the 'fast' part of urgent handling. */
    5538          19 : static void tcp_urg(struct sock *sk, struct sk_buff *skb, const struct tcphdr *th)
    5539             : {
    5540          19 :         struct tcp_sock *tp = tcp_sk(sk);
    5541             : 
    5542             :         /* Check if we get a new urgent pointer - normally not. */
    5543          19 :         if (th->urg)
    5544           0 :                 tcp_check_urg(sk, th);
    5545             : 
    5546             :         /* Do we wait for any urgent data? - normally not... */
    5547          19 :         if (tp->urg_data == TCP_URG_NOTYET) {
    5548           0 :                 u32 ptr = tp->urg_seq - ntohl(th->seq) + (th->doff * 4) -
    5549           0 :                           th->syn;
    5550             : 
    5551             :                 /* Is the urgent pointer pointing into this packet? */
    5552           0 :                 if (ptr < skb->len) {
    5553           0 :                         u8 tmp;
    5554           0 :                         if (skb_copy_bits(skb, ptr, &tmp, 1))
    5555           0 :                                 BUG();
    5556           0 :                         tp->urg_data = TCP_URG_VALID | tmp;
    5557           0 :                         if (!sock_flag(sk, SOCK_DEAD))
    5558           0 :                                 sk->sk_data_ready(sk);
    5559             :                 }
    5560             :         }
    5561          19 : }
    5562             : 
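/* Minimal sketch of the tp->urg_data encoding used above, based on the
 * assignments in tcp_check_urg()/tcp_urg(): the low byte carries the urgent
 * byte itself and the high bits carry a state flag (TCP_URG_NOTYET while the
 * byte is still expected, TCP_URG_VALID once it has been captured).  The
 * helper below is illustrative only, not a kernel API.
 */
static inline bool urg_byte_captured(u16 urg_data, u8 *byte)
{
	if (!(urg_data & TCP_URG_VALID))
		return false;		/* still waiting, or no urgent data */
	*byte = urg_data & 0xFF;	/* the single out-of-band byte */
	return true;
}
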
    5563             : /* Accept RST for rcv_nxt - 1 after a FIN.
    5564             :  * When tcp connections are abruptly terminated from Mac OSX (via ^C), a
    5565             :  * FIN is sent followed by a RST packet. The RST is sent with the same
    5566             :  * sequence number as the FIN, and thus according to RFC 5961 a challenge
    5567             :  * ACK should be sent. However, Mac OSX rate limits replies to challenge
    5568             :  * ACKs on the closed socket. In addition, middleboxes can drop either the
    5569             :  * challenge ACK or a subsequent RST.
    5570             :  */
    5571           0 : static bool tcp_reset_check(const struct sock *sk, const struct sk_buff *skb)
    5572             : {
    5573           0 :         struct tcp_sock *tp = tcp_sk(sk);
    5574             : 
    5575           0 :         return unlikely(TCP_SKB_CB(skb)->seq == (tp->rcv_nxt - 1) &&
    5576             :                         (1 << sk->sk_state) & (TCPF_CLOSE_WAIT | TCPF_LAST_ACK |
    5577             :                                                TCPF_CLOSING));
    5578             : }
    5579             : 
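/* The (rcv_nxt - 1) test above, like most checks in this file, depends on
 * wraparound-safe 32-bit sequence comparisons.  A standalone sketch of the
 * idiom, equivalent in spirit to the kernel's before()/after() helpers;
 * the names below are illustrative only.
 */
static inline bool seq_before(u32 seq1, u32 seq2)
{
	/* True when seq1 precedes seq2, even across a 2^32 wrap. */
	return (s32)(seq1 - seq2) < 0;
}

static inline bool seq_after(u32 seq1, u32 seq2)
{
	return seq_before(seq2, seq1);
}
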
    5580             : /* Performs PAWS and sequence-number based validation of an incoming
    5581             :  * segment; the TCP flags play a significant role here.
    5582             :  */
    5583          10 : static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
    5584             :                                   const struct tcphdr *th, int syn_inerr)
    5585             : {
    5586          10 :         struct tcp_sock *tp = tcp_sk(sk);
    5587          10 :         bool rst_seq_match = false;
    5588             : 
    5589             :         /* RFC1323: H1. Apply PAWS check first. */
    5590          10 :         if (tcp_fast_parse_options(sock_net(sk), skb, th, tp) &&
    5591           0 :             tp->rx_opt.saw_tstamp &&
    5592           0 :             tcp_paws_discard(sk, skb)) {
    5593           0 :                 if (!th->rst) {
    5594           0 :                         NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
    5595           0 :                         if (!tcp_oow_rate_limited(sock_net(sk), skb,
    5596             :                                                   LINUX_MIB_TCPACKSKIPPEDPAWS,
    5597             :                                                   &tp->last_oow_ack_time))
    5598           0 :                                 tcp_send_dupack(sk, skb);
    5599           0 :                         goto discard;
    5600             :                 }
    5601             :                 /* Reset is accepted even if it did not pass PAWS. */
    5602             :         }
    5603             : 
    5604             :         /* Step 1: check sequence number */
    5605          10 :         if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) {
    5606             :                 /* RFC793, page 37: "In all states except SYN-SENT, all reset
    5607             :                  * (RST) segments are validated by checking their SEQ-fields."
    5608             :                  * And page 69: "If an incoming segment is not acceptable,
    5609             :                  * an acknowledgment should be sent in reply (unless the RST
    5610             :                  * bit is set, if so drop the segment and return)".
    5611             :                  */
    5612           0 :                 if (!th->rst) {
    5613           0 :                         if (th->syn)
    5614           0 :                                 goto syn_challenge;
    5615           0 :                         if (!tcp_oow_rate_limited(sock_net(sk), skb,
    5616             :                                                   LINUX_MIB_TCPACKSKIPPEDSEQ,
    5617             :                                                   &tp->last_oow_ack_time))
    5618           0 :                                 tcp_send_dupack(sk, skb);
    5619           0 :                 } else if (tcp_reset_check(sk, skb)) {
    5620           0 :                         tcp_reset(sk, skb);
    5621             :                 }
    5622           0 :                 goto discard;
    5623             :         }
    5624             : 
    5625             :         /* Step 2: check RST bit */
    5626          10 :         if (th->rst) {
    5627             :                 /* RFC 5961 3.2 (extend to match against (RCV.NXT - 1) after a
    5628             :                  * FIN and SACK too if available):
    5629             :                  * If seq num matches RCV.NXT or (RCV.NXT - 1) after a FIN, or
    5630             :                  * the right-most SACK block,
    5631             :                  * then
    5632             :                  *     RESET the connection
    5633             :                  * else
    5634             :                  *     Send a challenge ACK
    5635             :                  */
    5636           0 :                 if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt ||
    5637           0 :                     tcp_reset_check(sk, skb)) {
    5638             :                         rst_seq_match = true;
    5639           0 :                 } else if (tcp_is_sack(tp) && tp->rx_opt.num_sacks > 0) {
    5640           0 :                         struct tcp_sack_block *sp = &tp->selective_acks[0];
    5641           0 :                         int max_sack = sp[0].end_seq;
    5642           0 :                         int this_sack;
    5643             : 
    5644           0 :                         for (this_sack = 1; this_sack < tp->rx_opt.num_sacks;
    5645           0 :                              ++this_sack) {
    5646           0 :                                 max_sack = after(sp[this_sack].end_seq,
    5647             :                                                  max_sack) ?
    5648           0 :                                         sp[this_sack].end_seq : max_sack;
    5649             :                         }
    5650             : 
    5651           0 :                         if (TCP_SKB_CB(skb)->seq == max_sack)
    5652             :                                 rst_seq_match = true;
    5653             :                 }
    5654             : 
    5655             :                 if (rst_seq_match)
    5656           0 :                         tcp_reset(sk, skb);
    5657             :                 else {
    5658             :                         /* Disable TFO if RST is out-of-order
    5659             :                          * and no data has been received
    5660             :                          * for current active TFO socket
    5661             :                          */
    5662           0 :                         if (tp->syn_fastopen && !tp->data_segs_in &&
    5663           0 :                             sk->sk_state == TCP_ESTABLISHED)
    5664           0 :                                 tcp_fastopen_active_disable(sk);
    5665           0 :                         tcp_send_challenge_ack(sk, skb);
    5666             :                 }
    5667           0 :                 goto discard;
    5668             :         }
    5669             : 
    5670             :         /* step 3: check security and precedence [ignored] */
    5671             : 
    5672             :         /* step 4: Check for a SYN
    5673             :          * RFC 5961 4.2 : Send a challenge ack
    5674             :          */
    5675          10 :         if (th->syn) {
    5676           0 : syn_challenge:
    5677           0 :                 if (syn_inerr)
    5678           0 :                         TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
    5679           0 :                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNCHALLENGE);
    5680           0 :                 tcp_send_challenge_ack(sk, skb);
    5681           0 :                 goto discard;
    5682             :         }
    5683             : 
    5684          10 :         bpf_skops_parse_hdr(sk, skb);
    5685             : 
    5686             :         return true;
    5687             : 
    5688           0 : discard:
    5689           0 :         tcp_drop(sk, skb);
    5690           0 :         return false;
    5691             : }
    5692             : 
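/* Standalone sketch of the "right-most SACK block" scan performed in the RST
 * validation above: per the RFC 5961 extension used here, an RST whose
 * sequence number equals the highest SACKed end_seq may still reset the
 * connection.  The helper name and the plain array/length parameters are
 * illustrative, not a kernel API.
 */
static u32 rightmost_sack_end_seq(const struct tcp_sack_block *sp, int num_sacks)
{
	u32 max_sack = sp[0].end_seq;	/* caller ensures num_sacks > 0 */
	int i;

	for (i = 1; i < num_sacks; i++)
		if (after(sp[i].end_seq, max_sack))
			max_sack = sp[i].end_seq;

	return max_sack;
}
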
    5693             : /*
    5694             :  *      TCP receive function for the ESTABLISHED state.
    5695             :  *
    5696             :  *      It is split into a fast path and a slow path. The fast path is
    5697             :  *      disabled when:
    5698             :  *      - A zero window was announced from us - zero window probing
    5699             :  *        is only handled properly in the slow path.
    5700             :  *      - Out of order segments arrived.
    5701             :  *      - Urgent data is expected.
    5702             :  *      - There is no buffer space left
    5703             :  *      - Unexpected TCP flags/window values/header lengths are received
    5704             :  *        (detected by checking the TCP header against pred_flags)
    5705             :  *      - Data is sent in both directions. Fast path only supports pure senders
    5706             :  *        or pure receivers (this means either the sequence number or the ack
    5707             :  *        value must stay constant)
    5708             :  *      - Unexpected TCP option.
    5709             :  *
    5710             :  *      When these conditions are not satisfied it drops into a standard
    5711             :  *      receive procedure patterned after RFC793 to handle all cases.
    5712             :  *      The first three cases are guaranteed by proper pred_flags setting,
    5713             :  *      the rest is checked inline. Fast processing is turned on in
    5714             :  *      tcp_data_queue when everything is OK.
    5715             :  */
    5716         412 : void tcp_rcv_established(struct sock *sk, struct sk_buff *skb)
    5717             : {
    5718         412 :         const struct tcphdr *th = (const struct tcphdr *)skb->data;
    5719         412 :         struct tcp_sock *tp = tcp_sk(sk);
    5720         412 :         unsigned int len = skb->len;
    5721             : 
    5722             :         /* TCP congestion window tracking */
    5723         412 :         trace_tcp_probe(sk, skb);
    5724             : 
    5725         412 :         tcp_mstamp_refresh(tp);
    5726         412 :         if (unlikely(!sk->sk_rx_dst))
    5727           0 :                 inet_csk(sk)->icsk_af_ops->sk_rx_dst_set(sk, skb);
    5728             :         /*
    5729             :          *      Header prediction.
    5730             :          *      The code loosely follows the one in the famous
    5731             :          *      "30 instruction TCP receive" Van Jacobson mail.
    5732             :          *
    5733             :          *      Van's trick is to deposit buffers into the socket queue
    5734             :          *      on a device interrupt, then call the tcp_recv function
    5735             :          *      in the receive process context to checksum and copy
    5736             :          *      the buffer to user space. Smart...
    5737             :          *
    5738             :          *      Our current scheme is not silly either, but we take the
    5739             :          *      extra cost of the net_bh soft interrupt processing...
    5740             :          *      We also do checksum and copy, but from device to kernel.
    5741             :          */
    5742             : 
    5743         412 :         tp->rx_opt.saw_tstamp = 0;
    5744             : 
    5745             :         /*      pred_flags is 0xS?10 << 16 + snd_wnd
    5746             :          *      if header_prediction is to be made
    5747             :          *      'S' will always be tp->tcp_header_len >> 2
    5748             :          *      '?' will be 0 for the fast path, otherwise pred_flags is 0
    5749             :          *      to turn it off (when there are holes in the receive space,
    5750             :          *      for instance)
    5751             :          *      PSH flag is ignored.
    5752             :          */
    5753             : 
    5754         412 :         if ((tcp_flag_word(th) & TCP_HP_BITS) == tp->pred_flags &&
    5755         409 :             TCP_SKB_CB(skb)->seq == tp->rcv_nxt &&
    5756         409 :             !after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) {
    5757         409 :                 int tcp_header_len = tp->tcp_header_len;
    5758             : 
    5759             :                 /* Timestamp header prediction: tcp_header_len
    5760             :                  * is automatically equal to th->doff*4 due to pred_flags
    5761             :                  * match.
    5762             :                  */
    5763             : 
    5764             :                 /* Check timestamp */
    5765         409 :                 if (tcp_header_len == sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) {
    5766             :                         /* No? Slow path! */
    5767           0 :                         if (!tcp_parse_aligned_timestamp(tp, th))
    5768           0 :                                 goto slow_path;
    5769             : 
    5770             :                         /* If PAWS failed, check it more carefully in slow path */
    5771           0 :                         if ((s32)(tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent) < 0)
    5772           0 :                                 goto slow_path;
    5773             : 
    5774             :                         /* DO NOT update ts_recent here, if checksum fails
    5775             :                          * and timestamp was corrupted part, it will result
    5776             :                          * in a hung connection since we will drop all
    5777             :                          * future packets due to the PAWS test.
    5778             :                          */
    5779             :                 }
    5780             : 
    5781         409 :                 if (len <= tcp_header_len) {
    5782             :                         /* Bulk data transfer: sender */
    5783         342 :                         if (len == tcp_header_len) {
    5784             :                                 /* Predicted packet is in window by definition.
    5785             :                                  * seq == rcv_nxt and rcv_wup <= rcv_nxt.
    5786             :                                  * Hence, check seq<=rcv_wup reduces to:
    5787             :                                  */
    5788         342 :                                 if (tcp_header_len ==
    5789           0 :                                     (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &&
    5790           0 :                                     tp->rcv_nxt == tp->rcv_wup)
    5791           0 :                                         tcp_store_ts_recent(tp);
    5792             : 
    5793             :                                 /* We know that such packets are checksummed
    5794             :                                  * on entry.
    5795             :                                  */
    5796         342 :                                 tcp_ack(sk, skb, 0);
    5797         342 :                                 __kfree_skb(skb);
    5798         342 :                                 tcp_data_snd_check(sk);
    5799             :                                 /* When receiving pure ack in fast path, update
    5800             :                                  * last ts ecr directly instead of calling
    5801             :                                  * tcp_rcv_rtt_measure_ts()
    5802             :                                  */
    5803         342 :                                 tp->rcv_rtt_last_tsecr = tp->rx_opt.rcv_tsecr;
    5804         342 :                                 return;
    5805             :                         } else { /* Header too small */
    5806           0 :                                 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
    5807           0 :                                 goto discard;
    5808             :                         }
    5809             :                 } else {
    5810          67 :                         int eaten = 0;
    5811          67 :                         bool fragstolen = false;
    5812             : 
    5813          67 :                         if (tcp_checksum_complete(skb))
    5814           0 :                                 goto csum_error;
    5815             : 
    5816          67 :                         if ((int)skb->truesize > sk->sk_forward_alloc)
    5817          12 :                                 goto step5;
    5818             : 
    5819             :                         /* Predicted packet is in window by definition.
    5820             :                          * seq == rcv_nxt and rcv_wup <= rcv_nxt.
    5821             :                          * Hence, check seq<=rcv_wup reduces to:
    5822             :                          */
    5823          55 :                         if (tcp_header_len ==
    5824           0 :                             (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &&
    5825           0 :                             tp->rcv_nxt == tp->rcv_wup)
    5826           0 :                                 tcp_store_ts_recent(tp);
    5827             : 
    5828          55 :                         tcp_rcv_rtt_measure_ts(sk, skb);
    5829             : 
    5830          55 :                         NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPHITS);
    5831             : 
    5832             :                         /* Bulk data transfer: receiver */
    5833          55 :                         __skb_pull(skb, tcp_header_len);
    5834          55 :                         eaten = tcp_queue_rcv(sk, skb, &fragstolen);
    5835             : 
    5836          55 :                         tcp_event_data_recv(sk, skb);
    5837             : 
    5838          55 :                         if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) {
    5839             :                                 /* Well, only one small jumplet in fast path... */
    5840           6 :                                 tcp_ack(sk, skb, FLAG_DATA);
    5841           6 :                                 tcp_data_snd_check(sk);
    5842           6 :                                 if (!inet_csk_ack_scheduled(sk))
    5843           0 :                                         goto no_ack;
    5844             :                         } else {
    5845          49 :                                 tcp_update_wl(tp, TCP_SKB_CB(skb)->seq);
    5846             :                         }
    5847             : 
    5848          55 :                         __tcp_ack_snd_check(sk, 0);
    5849          55 : no_ack:
    5850          55 :                         if (eaten)
    5851           0 :                                 kfree_skb_partial(skb, fragstolen);
    5852          55 :                         tcp_data_ready(sk);
    5853          55 :                         return;
    5854             :                 }
    5855             :         }
    5856             : 
    5857           3 : slow_path:
    5858           3 :         if (len < (th->doff << 2) || tcp_checksum_complete(skb))
    5859           0 :                 goto csum_error;
    5860             : 
    5861           3 :         if (!th->ack && !th->rst && !th->syn)
    5862           0 :                 goto discard;
    5863             : 
    5864             :         /*
    5865             :          *      Standard slow path.
    5866             :          */
    5867             : 
    5868           3 :         if (!tcp_validate_incoming(sk, skb, th, 1))
    5869             :                 return;
    5870             : 
    5871           3 : step5:
    5872          15 :         if (tcp_ack(sk, skb, FLAG_SLOWPATH | FLAG_UPDATE_TS_RECENT) < 0)
    5873           0 :                 goto discard;
    5874             : 
    5875          15 :         tcp_rcv_rtt_measure_ts(sk, skb);
    5876             : 
    5877             :         /* Process urgent data. */
    5878          15 :         tcp_urg(sk, skb, th);
    5879             : 
    5880             :         /* step 7: process the segment text */
    5881          15 :         tcp_data_queue(sk, skb);
    5882             : 
    5883          15 :         tcp_data_snd_check(sk);
    5884          15 :         tcp_ack_snd_check(sk);
    5885          15 :         return;
    5886             : 
    5887           0 : csum_error:
    5888           0 :         TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
    5889           0 :         TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
    5890             : 
    5891           0 : discard:
    5892           0 :         tcp_drop(sk, skb);
    5893             : }
    5894             : EXPORT_SYMBOL(tcp_rcv_established);
    5895             : 
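/* Sketch of the pred_flags encoding that gates the fast path above, matching
 * the "0xS?10 << 16 + snd_wnd" layout described in the header-prediction
 * comment: the data-offset nibble 'S' (tcp_header_len >> 2) in the top bits,
 * the ACK flag (0x10) in the flags byte, and the window field as it appears
 * on the wire in the low 16 bits.  Illustrative helper only (roughly what
 * __tcp_fast_path_on(), called from tcp_fast_path_on() below, does); storing
 * 0 disables the fast path, as tcp_check_urg() does above.
 */
static inline __be32 pred_flags_sketch(unsigned int tcp_header_len, u32 wire_wnd)
{
	return htonl((tcp_header_len << 26) |	/* doff nibble: header len / 4 */
		     ntohl(TCP_FLAG_ACK) |	/* expect exactly the ACK bit */
		     wire_wnd);			/* window as carried in the header */
}
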
    5896           4 : void tcp_init_transfer(struct sock *sk, int bpf_op, struct sk_buff *skb)
    5897             : {
    5898           4 :         struct inet_connection_sock *icsk = inet_csk(sk);
    5899           4 :         struct tcp_sock *tp = tcp_sk(sk);
    5900             : 
    5901           4 :         tcp_mtup_init(sk);
    5902           4 :         icsk->icsk_af_ops->rebuild_header(sk);
    5903           4 :         tcp_init_metrics(sk);
    5904             : 
    5905             :         /* Initialize the congestion window to start the transfer.
    5906             :          * Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been
    5907             :          * retransmitted. In light of RFC6298's more aggressive 1sec
    5908             :          * initRTO, we only reset cwnd when more than 1 SYN/SYN-ACK
    5909             :          * retransmission has occurred.
    5910             :          */
    5911           4 :         if (tp->total_retrans > 1 && tp->undo_marker)
    5912           0 :                 tp->snd_cwnd = 1;
    5913             :         else
    5914           4 :                 tp->snd_cwnd = tcp_init_cwnd(tp, __sk_dst_get(sk));
    5915           4 :         tp->snd_cwnd_stamp = tcp_jiffies32;
    5916             : 
    5917           4 :         icsk->icsk_ca_initialized = 0;
    5918           4 :         bpf_skops_established(sk, bpf_op, skb);
    5919           4 :         if (!icsk->icsk_ca_initialized)
    5920           4 :                 tcp_init_congestion_control(sk);
    5921           4 :         tcp_init_buffer_space(sk);
    5922           4 : }
    5923             : 
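/* Condensed sketch of the initial-cwnd decision made in tcp_init_transfer()
 * above, assuming the usual TCP_INIT_CWND default of 10 segments (RFC 6928)
 * when no route metric overrides it.  The helper and its flat parameters are
 * illustrative; the real choice uses tcp_init_cwnd() on the cached dst.
 */
static u32 initial_cwnd_sketch(u32 total_retrans, u32 undo_marker, u32 route_initcwnd)
{
	/* More than one SYN/SYN-ACK retransmission: fall back to cwnd = 1. */
	if (total_retrans > 1 && undo_marker)
		return 1;

	/* Otherwise honour a per-route override, else the RFC 6928 IW10. */
	return route_initcwnd ? route_initcwnd : TCP_INIT_CWND;
}
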
    5924           0 : void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
    5925             : {
    5926           0 :         struct tcp_sock *tp = tcp_sk(sk);
    5927           0 :         struct inet_connection_sock *icsk = inet_csk(sk);
    5928             : 
    5929           0 :         tcp_set_state(sk, TCP_ESTABLISHED);
    5930           0 :         icsk->icsk_ack.lrcvtime = tcp_jiffies32;
    5931             : 
    5932           0 :         if (skb) {
    5933           0 :                 icsk->icsk_af_ops->sk_rx_dst_set(sk, skb);
    5934           0 :                 security_inet_conn_established(sk, skb);
    5935           0 :                 sk_mark_napi_id(sk, skb);
    5936             :         }
    5937             : 
    5938           0 :         tcp_init_transfer(sk, BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB, skb);
    5939             : 
    5940             :         /* Prevent spurious tcp_cwnd_restart() on first data
    5941             :          * packet.
    5942             :          */
    5943           0 :         tp->lsndtime = tcp_jiffies32;
    5944             : 
    5945           0 :         if (sock_flag(sk, SOCK_KEEPOPEN))
    5946           0 :                 inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp));
    5947             : 
    5948           0 :         if (!tp->rx_opt.snd_wscale)
    5949           0 :                 __tcp_fast_path_on(tp, tp->snd_wnd);
    5950             :         else
    5951           0 :                 tp->pred_flags = 0;
    5952           0 : }
    5953             : 
    5954           0 : static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
    5955             :                                     struct tcp_fastopen_cookie *cookie)
    5956             : {
    5957           0 :         struct tcp_sock *tp = tcp_sk(sk);
    5958           0 :         struct sk_buff *data = tp->syn_data ? tcp_rtx_queue_head(sk) : NULL;
    5959           0 :         u16 mss = tp->rx_opt.mss_clamp, try_exp = 0;
    5960           0 :         bool syn_drop = false;
    5961             : 
    5962           0 :         if (mss == tp->rx_opt.user_mss) {
    5963           0 :                 struct tcp_options_received opt;
    5964             : 
    5965             :                 /* Get original SYNACK MSS value if user MSS sets mss_clamp */
    5966           0 :                 tcp_clear_options(&opt);
    5967           0 :                 opt.user_mss = opt.mss_clamp = 0;
    5968           0 :                 tcp_parse_options(sock_net(sk), synack, &opt, 0, NULL);
    5969           0 :                 mss = opt.mss_clamp;
    5970             :         }
    5971             : 
    5972           0 :         if (!tp->syn_fastopen) {
    5973             :                 /* Ignore an unsolicited cookie */
    5974           0 :                 cookie->len = -1;
    5975           0 :         } else if (tp->total_retrans) {
    5976             :                 /* SYN timed out and the SYN-ACK neither has a cookie nor
    5977             :                  * acknowledges data. Presumably the remote received only
    5978             :                  * the retransmitted (regular) SYNs: either the original
    5979             :                  * SYN-data or the corresponding SYN-ACK was dropped.
    5980             :                  */
    5981           0 :                 syn_drop = (cookie->len < 0 && data);
    5982           0 :         } else if (cookie->len < 0 && !tp->syn_data) {
    5983             :                 /* We requested a cookie but didn't get it. If we did not use
    5984             :                  * the (old) exp opt format then try it next time (try_exp=1).
    5985             :                  * Otherwise we go back to using the RFC7413 opt (try_exp=2).
    5986             :                  */
    5987           0 :                 try_exp = tp->syn_fastopen_exp ? 2 : 1;
    5988             :         }
    5989             : 
    5990           0 :         tcp_fastopen_cache_set(sk, mss, cookie, syn_drop, try_exp);
    5991             : 
    5992           0 :         if (data) { /* Retransmit unacked data in SYN */
    5993           0 :                 if (tp->total_retrans)
    5994           0 :                         tp->fastopen_client_fail = TFO_SYN_RETRANSMITTED;
    5995             :                 else
    5996           0 :                         tp->fastopen_client_fail = TFO_DATA_NOT_ACKED;
    5997           0 :                 skb_rbtree_walk_from(data) {
    5998           0 :                         if (__tcp_retransmit_skb(sk, data, 1))
    5999             :                                 break;
    6000             :                 }
    6001           0 :                 tcp_rearm_rto(sk);
    6002           0 :                 NET_INC_STATS(sock_net(sk),
    6003             :                                 LINUX_MIB_TCPFASTOPENACTIVEFAIL);
    6004           0 :                 return true;
    6005             :         }
    6006           0 :         tp->syn_data_acked = tp->syn_data;
    6007           0 :         if (tp->syn_data_acked) {
    6008           0 :                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVE);
    6009             :                 /* SYN-data is counted as two separate packets in tcp_ack() */
    6010           0 :                 if (tp->delivered > 1)
    6011           0 :                         --tp->delivered;
    6012             :         }
    6013             : 
    6014           0 :         tcp_fastopen_add_skb(sk, synack);
    6015             : 
    6016           0 :         return false;
    6017             : }
    6018             : 
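/* User-space sketch (not kernel code) of the client behaviour that the
 * SYN-ACK handling above serves: sending data in the SYN with MSG_FASTOPEN.
 * When the server returns no cookie or does not ACK the SYN data, the code
 * above falls back and retransmits it from the rtx queue.  "srv" and
 * "payload" are hypothetical; error handling is elided.
 */
#include <netinet/in.h>
#include <string.h>
#include <sys/socket.h>

static ssize_t tfo_send(const struct sockaddr_in *srv, const char *payload)
{
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	/* connect() plus the first data segment in one call; the kernel falls
	 * back to a regular 3WHS transparently when no valid cookie is cached.
	 */
	return sendto(fd, payload, strlen(payload), MSG_FASTOPEN,
		      (const struct sockaddr *)srv, sizeof(*srv));
}
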
    6019           0 : static void smc_check_reset_syn(struct tcp_sock *tp)
    6020             : {
    6021             : #if IS_ENABLED(CONFIG_SMC)
    6022             :         if (static_branch_unlikely(&tcp_have_smc)) {
    6023             :                 if (tp->syn_smc && !tp->rx_opt.smc_ok)
    6024             :                         tp->syn_smc = 0;
    6025             :         }
    6026             : #endif
    6027           0 : }
    6028             : 
    6029           4 : static void tcp_try_undo_spurious_syn(struct sock *sk)
    6030             : {
    6031           4 :         struct tcp_sock *tp = tcp_sk(sk);
    6032           4 :         u32 syn_stamp;
    6033             : 
    6034             :         /* undo_marker is set when SYN or SYNACK times out. The timeout is
    6035             :          * spurious if the ACK's timestamp option echo value matches the
    6036             :          * original SYN timestamp.
    6037             :          */
    6038           4 :         syn_stamp = tp->retrans_stamp;
    6039           4 :         if (tp->undo_marker && syn_stamp && tp->rx_opt.saw_tstamp &&
    6040           0 :             syn_stamp == tp->rx_opt.rcv_tsecr)
    6041           0 :                 tp->undo_marker = 0;
    6042           4 : }
    6043             : 
    6044           0 : static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
    6045             :                                          const struct tcphdr *th)
    6046             : {
    6047           0 :         struct inet_connection_sock *icsk = inet_csk(sk);
    6048           0 :         struct tcp_sock *tp = tcp_sk(sk);
    6049           0 :         struct tcp_fastopen_cookie foc = { .len = -1 };
    6050           0 :         int saved_clamp = tp->rx_opt.mss_clamp;
    6051           0 :         bool fastopen_fail;
    6052             : 
    6053           0 :         tcp_parse_options(sock_net(sk), skb, &tp->rx_opt, 0, &foc);
    6054           0 :         if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
    6055           0 :                 tp->rx_opt.rcv_tsecr -= tp->tsoffset;
    6056             : 
    6057           0 :         if (th->ack) {
    6058             :                 /* rfc793:
    6059             :                  * "If the state is SYN-SENT then
    6060             :                  *    first check the ACK bit
    6061             :                  *      If the ACK bit is set
    6062             :                  *        If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send
    6063             :                  *        a reset (unless the RST bit is set, if so drop
    6064             :                  *        the segment and return)"
    6065             :                  */
    6066           0 :                 if (!after(TCP_SKB_CB(skb)->ack_seq, tp->snd_una) ||
    6067           0 :                     after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) {
    6068             :                         /* Previous FIN/ACK or RST/ACK might be ignored. */
    6069           0 :                         if (icsk->icsk_retransmits == 0)
    6070           0 :                                 inet_csk_reset_xmit_timer(sk,
    6071             :                                                 ICSK_TIME_RETRANS,
    6072             :                                                 TCP_TIMEOUT_MIN, TCP_RTO_MAX);
    6073           0 :                         goto reset_and_undo;
    6074             :                 }
    6075             : 
    6076           0 :                 if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
    6077           0 :                     !between(tp->rx_opt.rcv_tsecr, tp->retrans_stamp,
    6078             :                              tcp_time_stamp(tp))) {
    6079           0 :                         NET_INC_STATS(sock_net(sk),
    6080             :                                         LINUX_MIB_PAWSACTIVEREJECTED);
    6081           0 :                         goto reset_and_undo;
    6082             :                 }
    6083             : 
    6084             :                 /* Now ACK is acceptable.
    6085             :                  *
    6086             :                  * "If the RST bit is set
    6087             :                  *    If the ACK was acceptable then signal the user "error:
    6088             :                  *    connection reset", drop the segment, enter CLOSED state,
    6089             :                  *    delete TCB, and return."
    6090             :                  */
    6091             : 
    6092           0 :                 if (th->rst) {
    6093           0 :                         tcp_reset(sk, skb);
    6094           0 :                         goto discard;
    6095             :                 }
    6096             : 
    6097             :                 /* rfc793:
    6098             :                  *   "fifth, if neither of the SYN or RST bits is set then
    6099             :                  *    drop the segment and return."
    6100             :                  *
    6101             :                  *    See note below!
    6102             :                  *                                        --ANK(990513)
    6103             :                  */
    6104           0 :                 if (!th->syn)
    6105           0 :                         goto discard_and_undo;
    6106             : 
    6107             :                 /* rfc793:
    6108             :                  *   "If the SYN bit is on ...
    6109             :                  *    are acceptable then ...
    6110             :                  *    (our SYN has been ACKed), change the connection
    6111             :                  *    state to ESTABLISHED..."
    6112             :                  */
    6113             : 
    6114           0 :                 tcp_ecn_rcv_synack(tp, th);
    6115             : 
    6116           0 :                 tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
    6117           0 :                 tcp_try_undo_spurious_syn(sk);
    6118           0 :                 tcp_ack(sk, skb, FLAG_SLOWPATH);
    6119             : 
    6120             :                 /* Ok.. it's good. Set up sequence numbers and
    6121             :                  * move to established.
    6122             :                  */
    6123           0 :                 WRITE_ONCE(tp->rcv_nxt, TCP_SKB_CB(skb)->seq + 1);
    6124           0 :                 tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;
    6125             : 
    6126             :                 /* RFC1323: The window in SYN & SYN/ACK segments is
    6127             :                  * never scaled.
    6128             :                  */
    6129           0 :                 tp->snd_wnd = ntohs(th->window);
    6130             : 
    6131           0 :                 if (!tp->rx_opt.wscale_ok) {
    6132           0 :                         tp->rx_opt.snd_wscale = tp->rx_opt.rcv_wscale = 0;
    6133           0 :                         tp->window_clamp = min(tp->window_clamp, 65535U);
    6134             :                 }
    6135             : 
    6136           0 :                 if (tp->rx_opt.saw_tstamp) {
    6137           0 :                         tp->rx_opt.tstamp_ok    = 1;
    6138           0 :                         tp->tcp_header_len =
    6139             :                                 sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
    6140           0 :                         tp->advmss       -= TCPOLEN_TSTAMP_ALIGNED;
    6141           0 :                         tcp_store_ts_recent(tp);
    6142             :                 } else {
    6143           0 :                         tp->tcp_header_len = sizeof(struct tcphdr);
    6144             :                 }
    6145             : 
    6146           0 :                 tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
    6147           0 :                 tcp_initialize_rcv_mss(sk);
    6148             : 
    6149             :                 /* Remember, tcp_poll() does not lock socket!
    6150             :                  * Change state from SYN-SENT only after copied_seq
    6151             :                  * is initialized. */
    6152           0 :                 WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
    6153             : 
    6154           0 :                 smc_check_reset_syn(tp);
    6155             : 
    6156           0 :                 smp_mb();
    6157             : 
    6158           0 :                 tcp_finish_connect(sk, skb);
    6159             : 
    6160           0 :                 fastopen_fail = (tp->syn_fastopen || tp->syn_data) &&
    6161           0 :                                 tcp_rcv_fastopen_synack(sk, skb, &foc);
    6162             : 
    6163           0 :                 if (!sock_flag(sk, SOCK_DEAD)) {
    6164           0 :                         sk->sk_state_change(sk);
    6165           0 :                         sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT);
    6166             :                 }
    6167           0 :                 if (fastopen_fail)
    6168             :                         return -1;
    6169           0 :                 if (sk->sk_write_pending ||
    6170           0 :                     icsk->icsk_accept_queue.rskq_defer_accept ||
    6171           0 :                     inet_csk_in_pingpong_mode(sk)) {
    6172             :                         /* Save one ACK. Data will be ready after
    6173             :                          * several ticks, if write_pending is set.
    6174             :                          *
    6175             :                          * It may be deleted, but with this feature tcpdumps
    6176             :                          * look so _wonderfully_ clever, that I was not able
    6177             :                          * to stand against the temptation 8)     --ANK
    6178             :                          */
    6179           0 :                         inet_csk_schedule_ack(sk);
    6180           0 :                         tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS);
    6181           0 :                         inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
    6182             :                                                   TCP_DELACK_MAX, TCP_RTO_MAX);
    6183             : 
    6184           0 : discard:
    6185           0 :                         tcp_drop(sk, skb);
    6186           0 :                         return 0;
    6187             :                 } else {
    6188           0 :                         tcp_send_ack(sk);
    6189             :                 }
    6190           0 :                 return -1;
    6191             :         }
    6192             : 
    6193             :         /* No ACK in the segment */
    6194             : 
    6195           0 :         if (th->rst) {
    6196             :                 /* rfc793:
    6197             :                  * "If the RST bit is set
    6198             :                  *
    6199             :                  *      Otherwise (no ACK) drop the segment and return."
    6200             :                  */
    6201             : 
    6202           0 :                 goto discard_and_undo;
    6203             :         }
    6204             : 
    6205             :         /* PAWS check. */
    6206           0 :         if (tp->rx_opt.ts_recent_stamp && tp->rx_opt.saw_tstamp &&
    6207           0 :             tcp_paws_reject(&tp->rx_opt, 0))
    6208           0 :                 goto discard_and_undo;
    6209             : 
    6210           0 :         if (th->syn) {
    6211             :                 /* We see a SYN without an ACK. It is an attempt at a
    6212             :                  * simultaneous connect with crossed SYNs.
    6213             :                  * In particular, it can be a connect to self.
    6214             :                  */
    6215           0 :                 tcp_set_state(sk, TCP_SYN_RECV);
    6216             : 
    6217           0 :                 if (tp->rx_opt.saw_tstamp) {
    6218           0 :                         tp->rx_opt.tstamp_ok = 1;
    6219           0 :                         tcp_store_ts_recent(tp);
    6220           0 :                         tp->tcp_header_len =
    6221             :                                 sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
    6222             :                 } else {
    6223           0 :                         tp->tcp_header_len = sizeof(struct tcphdr);
    6224             :                 }
    6225             : 
    6226           0 :                 WRITE_ONCE(tp->rcv_nxt, TCP_SKB_CB(skb)->seq + 1);
    6227           0 :                 WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
    6228           0 :                 tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;
    6229             : 
    6230             :                 /* RFC1323: The window in SYN & SYN/ACK segments is
    6231             :                  * never scaled.
    6232             :                  */
    6233           0 :                 tp->snd_wnd    = ntohs(th->window);
    6234           0 :                 tp->snd_wl1    = TCP_SKB_CB(skb)->seq;
    6235           0 :                 tp->max_window = tp->snd_wnd;
    6236             : 
    6237           0 :                 tcp_ecn_rcv_syn(tp, th);
    6238             : 
    6239           0 :                 tcp_mtup_init(sk);
    6240           0 :                 tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
    6241           0 :                 tcp_initialize_rcv_mss(sk);
    6242             : 
    6243           0 :                 tcp_send_synack(sk);
    6244             : #if 0
    6245             :                 /* Note: we could accept data and URG from this segment.
    6246             :                  * There is no obstacle to doing so (except that we must
    6247             :                  * either change tcp_recvmsg() to prevent it from returning data
    6248             :                  * before the 3WHS completes per RFC793, or employ TCP Fast Open).
    6249             :                  *
    6250             :                  * However, if we sometimes ignore data in ACKless segments,
    6251             :                  * we have no reason to accept it at other times.
    6252             :                  * Also, the code doing this in step6 of tcp_rcv_state_process
    6253             :                  * does not appear flawless. So, discard the packet for sanity.
    6254             :                  * Uncomment this return to process the data.
    6255             :                  */
    6256             :                 return -1;
    6257             : #else
    6258           0 :                 goto discard;
    6259             : #endif
    6260             :         }
    6261             :         /* "fifth, if neither of the SYN or RST bits is set then
    6262             :          * drop the segment and return."
    6263             :          */
    6264             : 
    6265           0 : discard_and_undo:
    6266           0 :         tcp_clear_options(&tp->rx_opt);
    6267           0 :         tp->rx_opt.mss_clamp = saved_clamp;
    6268           0 :         goto discard;
    6269             : 
    6270           0 : reset_and_undo:
    6271           0 :         tcp_clear_options(&tp->rx_opt);
    6272           0 :         tp->rx_opt.mss_clamp = saved_clamp;
    6273           0 :         return 1;
    6274             : }
    6275             : 
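/* Sketch of the RFC 793 ACK-acceptability test applied in the SYN-SENT
 * handling above: the ACK must cover our SYN, i.e. fall in (SND.UNA, SND.NXT]
 * (SND.UNA equals ISS at this point).  Anything outside that range earns a
 * reset.  The helper name is illustrative; the real check is inlined above.
 */
static inline bool synsent_ack_acceptable(u32 ack_seq, u32 snd_una, u32 snd_nxt)
{
	return after(ack_seq, snd_una) && !after(ack_seq, snd_nxt);
}
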
    6276           0 : static void tcp_rcv_synrecv_state_fastopen(struct sock *sk)
    6277             : {
    6278           0 :         struct request_sock *req;
    6279             : 
    6280             :         /* If we are still handling the SYNACK RTO, see if timestamp ECR allows
    6281             :          * undo. If peer SACKs triggered fast recovery, we can't undo here.
    6282             :          */
    6283           0 :         if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss)
    6284           0 :                 tcp_try_undo_loss(sk, false);
    6285             : 
    6286             :         /* Reset rtx states to prevent spurious retransmits_timed_out() */
    6287           0 :         tcp_sk(sk)->retrans_stamp = 0;
    6288           0 :         inet_csk(sk)->icsk_retransmits = 0;
    6289             : 
    6290             :         /* Once we leave TCP_SYN_RECV or TCP_FIN_WAIT1,
    6291             :          * we no longer need req so release it.
    6292             :          */
    6293           0 :         req = rcu_dereference_protected(tcp_sk(sk)->fastopen_rsk,
    6294             :                                         lockdep_sock_is_held(sk));
    6295           0 :         reqsk_fastopen_remove(sk, req, false);
    6296             : 
    6297             :         /* Re-arm the timer because data may have been sent out.
    6298             :          * This is similar to the regular data transmission case
    6299             :          * when new data has just been ack'ed.
    6300             :          *
    6301             :          * (TFO) - we could try to be more aggressive and
    6302             :          * retransmit any data sooner, based on when it
    6303             :          * was sent out.
    6304             :          */
    6305           0 :         tcp_rearm_rto(sk);
    6306           0 : }
    6307             : 
    6308             : /*
    6309             :  *      This function implements the receiving procedure of RFC 793 for
    6310             :  *      all states except ESTABLISHED and TIME_WAIT.
    6311             :  *      It's called from both tcp_v4_rcv and tcp_v6_rcv and should be
    6312             :  *      address independent.
    6313             :  */
    6314             : 
    6315          11 : int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
    6316             : {
    6317          11 :         struct tcp_sock *tp = tcp_sk(sk);
    6318          11 :         struct inet_connection_sock *icsk = inet_csk(sk);
    6319          11 :         const struct tcphdr *th = tcp_hdr(skb);
    6320          11 :         struct request_sock *req;
    6321          11 :         int queued = 0;
    6322          11 :         bool acceptable;
    6323             : 
    6324          11 :         switch (sk->sk_state) {
    6325           0 :         case TCP_CLOSE:
    6326           0 :                 goto discard;
    6327             : 
    6328           4 :         case TCP_LISTEN:
    6329           4 :                 if (th->ack)
    6330             :                         return 1;
    6331             : 
    6332           4 :                 if (th->rst)
    6333           0 :                         goto discard;
    6334             : 
    6335           4 :                 if (th->syn) {
    6336           4 :                         if (th->fin)
    6337           0 :                                 goto discard;
    6338             :                         /* It is possible that we process SYN packets from backlog,
    6339             :                          * so we need to make sure to disable BH and RCU right there.
    6340             :                          */
    6341           4 :                         rcu_read_lock();
    6342           4 :                         local_bh_disable();
    6343           4 :                         acceptable = icsk->icsk_af_ops->conn_request(sk, skb) >= 0;
    6344           4 :                         local_bh_enable();
    6345           4 :                         rcu_read_unlock();
    6346             : 
    6347           4 :                         if (!acceptable)
    6348             :                                 return 1;
    6349           4 :                         consume_skb(skb);
    6350           4 :                         return 0;
    6351             :                 }
    6352           0 :                 goto discard;
    6353             : 
    6354           0 :         case TCP_SYN_SENT:
    6355           0 :                 tp->rx_opt.saw_tstamp = 0;
    6356           0 :                 tcp_mstamp_refresh(tp);
    6357           0 :                 queued = tcp_rcv_synsent_state_process(sk, skb, th);
    6358           0 :                 if (queued >= 0)
    6359             :                         return queued;
    6360             : 
    6361             :                 /* Do step6 onward by hand. */
    6362           0 :                 tcp_urg(sk, skb, th);
    6363           0 :                 __kfree_skb(skb);
    6364           0 :                 tcp_data_snd_check(sk);
    6365           0 :                 return 0;
    6366             :         }
    6367             : 
    6368           7 :         tcp_mstamp_refresh(tp);
    6369           7 :         tp->rx_opt.saw_tstamp = 0;
    6370           7 :         req = rcu_dereference_protected(tp->fastopen_rsk,
    6371             :                                         lockdep_sock_is_held(sk));
    6372           7 :         if (req) {
    6373           0 :                 bool req_stolen;
    6374             : 
    6375           0 :                 WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV &&
    6376             :                     sk->sk_state != TCP_FIN_WAIT1);
    6377             : 
    6378           0 :                 if (!tcp_check_req(sk, skb, req, true, &req_stolen))
    6379           0 :                         goto discard;
    6380             :         }
    6381             : 
    6382           7 :         if (!th->ack && !th->rst && !th->syn)
    6383           0 :                 goto discard;
    6384             : 
    6385           7 :         if (!tcp_validate_incoming(sk, skb, th, 0))
    6386             :                 return 0;
    6387             : 
    6388             :         /* step 5: check the ACK field */
    6389           7 :         acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH |
    6390             :                                       FLAG_UPDATE_TS_RECENT |
    6391             :                                       FLAG_NO_CHALLENGE_ACK) > 0;
    6392             : 
    6393           7 :         if (!acceptable) {
    6394           0 :                 if (sk->sk_state == TCP_SYN_RECV)
    6395             :                         return 1;       /* send one RST */
    6396           0 :                 tcp_send_challenge_ack(sk, skb);
    6397           0 :                 goto discard;
    6398             :         }
    6399           7 :         switch (sk->sk_state) {
    6400           4 :         case TCP_SYN_RECV:
    6401           4 :                 tp->delivered++; /* SYN-ACK delivery isn't tracked in tcp_ack */
    6402           4 :                 if (!tp->srtt_us)
    6403           0 :                         tcp_synack_rtt_meas(sk, req);
    6404             : 
    6405           4 :                 if (req) {
    6406           0 :                         tcp_rcv_synrecv_state_fastopen(sk);
    6407             :                 } else {
    6408           4 :                         tcp_try_undo_spurious_syn(sk);
    6409           4 :                         tp->retrans_stamp = 0;
    6410           4 :                         tcp_init_transfer(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB,
    6411             :                                           skb);
    6412           4 :                         WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
    6413             :                 }
    6414           4 :                 smp_mb();
    6415           4 :                 tcp_set_state(sk, TCP_ESTABLISHED);
    6416           4 :                 sk->sk_state_change(sk);
    6417             : 
    6418             :                 /* Note that this wakeup is only for the marginal crossed-SYN case.
    6419             :                  * Passively opened sockets are not woken up, because
    6420             :                  * sk->sk_sleep == NULL and sk->sk_socket == NULL.
    6421             :                  */
    6422           4 :                 if (sk->sk_socket)
    6423           0 :                         sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT);
    6424             : 
    6425           4 :                 tp->snd_una = TCP_SKB_CB(skb)->ack_seq;
    6426           4 :                 tp->snd_wnd = ntohs(th->window) << tp->rx_opt.snd_wscale;
    6427           4 :                 tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
    6428             : 
    6429           4 :                 if (tp->rx_opt.tstamp_ok)
    6430           0 :                         tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
    6431             : 
    6432           4 :                 if (!inet_csk(sk)->icsk_ca_ops->cong_control)
    6433           4 :                         tcp_update_pacing_rate(sk);
    6434             : 
    6435             :                 /* Prevent spurious tcp_cwnd_restart() on first data packet */
    6436           4 :                 tp->lsndtime = tcp_jiffies32;
    6437             : 
    6438           4 :                 tcp_initialize_rcv_mss(sk);
    6439           4 :                 tcp_fast_path_on(tp);
    6440             :                 break;
    6441             : 
    6442           0 :         case TCP_FIN_WAIT1: {
    6443           0 :                 int tmo;
    6444             : 
    6445           0 :                 if (req)
    6446           0 :                         tcp_rcv_synrecv_state_fastopen(sk);
    6447             : 
    6448           0 :                 if (tp->snd_una != tp->write_seq)
    6449             :                         break;
    6450             : 
    6451           0 :                 tcp_set_state(sk, TCP_FIN_WAIT2);
    6452           0 :                 sk->sk_shutdown |= SEND_SHUTDOWN;
    6453             : 
    6454           0 :                 sk_dst_confirm(sk);
    6455             : 
    6456           0 :                 if (!sock_flag(sk, SOCK_DEAD)) {
    6457             :                         /* Wake up lingering close() */
    6458           0 :                         sk->sk_state_change(sk);
    6459           0 :                         break;
    6460             :                 }
    6461             : 
    6462           0 :                 if (tp->linger2 < 0) {
    6463           0 :                         tcp_done(sk);
    6464           0 :                         NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
    6465           0 :                         return 1;
    6466             :                 }
    6467           0 :                 if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
    6468           0 :                     after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) {
    6469             :                         /* Received an out-of-order FIN after close() */
    6470           0 :                         if (tp->syn_fastopen && th->fin)
    6471           0 :                                 tcp_fastopen_active_disable(sk);
    6472           0 :                         tcp_done(sk);
    6473           0 :                         NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
    6474           0 :                         return 1;
    6475             :                 }
    6476             : 
    6477           0 :                 tmo = tcp_fin_time(sk);
    6478           0 :                 if (tmo > TCP_TIMEWAIT_LEN) {
    6479           0 :                         inet_csk_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN);
    6480           0 :                 } else if (th->fin || sock_owned_by_user(sk)) {
    6481             :                         /* Bad case: we could lose such a FIN otherwise.
    6482             :                          * It is not a big problem, but it looks confusing
    6483             :                          * and is not such a rare event. We can still lose it
    6484             :                          * now, if it spins in bh_lock_sock(), but that is a
    6485             :                          * really marginal case.
    6486             :                          */
    6487           0 :                         inet_csk_reset_keepalive_timer(sk, tmo);
    6488             :                 } else {
    6489           0 :                         tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
    6490           0 :                         goto discard;
    6491             :                 }
    6492             :                 break;
    6493             :         }
    6494             : 
    6495           0 :         case TCP_CLOSING:
    6496           0 :                 if (tp->snd_una == tp->write_seq) {
    6497           0 :                         tcp_time_wait(sk, TCP_TIME_WAIT, 0);
    6498           0 :                         goto discard;
    6499             :                 }
    6500             :                 break;
    6501             : 
    6502           3 :         case TCP_LAST_ACK:
    6503           3 :                 if (tp->snd_una == tp->write_seq) {
    6504           3 :                         tcp_update_metrics(sk);
    6505           3 :                         tcp_done(sk);
    6506           3 :                         goto discard;
    6507             :                 }
    6508             :                 break;
    6509             :         }
    6510             : 
    6511             :         /* step 6: check the URG bit */
    6512           4 :         tcp_urg(sk, skb, th);
    6513             : 
    6514             :         /* step 7: process the segment text */
    6515           4 :         switch (sk->sk_state) {
    6516           0 :         case TCP_CLOSE_WAIT:
    6517             :         case TCP_CLOSING:
    6518             :         case TCP_LAST_ACK:
    6519           0 :                 if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
    6520           4 :                         if (sk_is_mptcp(sk))
    6521           4 :                                 mptcp_incoming_options(sk, skb);
    6522             :                         break;
    6523             :                 }
    6524           0 :                 fallthrough;
    6525             :         case TCP_FIN_WAIT1:
    6526             :         case TCP_FIN_WAIT2:
    6527             :                 /* RFC 793 says to queue data in these states;
    6528             :                  * RFC 1122 says we MUST send a reset.
    6529             :                  * 4.4BSD also sends a reset.
    6530             :                  */
    6531           0 :                 if (sk->sk_shutdown & RCV_SHUTDOWN) {
    6532           0 :                         if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
    6533           0 :                             after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) {
    6534           0 :                                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
    6535           0 :                                 tcp_reset(sk, skb);
    6536           0 :                                 return 1;
    6537             :                         }
    6538             :                 }
    6539           4 :                 fallthrough;
    6540             :         case TCP_ESTABLISHED:
    6541           4 :                 tcp_data_queue(sk, skb);
    6542           4 :                 queued = 1;
    6543           4 :                 break;
    6544             :         }
    6545             : 
    6546             :         /* tcp_data_queue() could have moved the socket to TIME-WAIT */
    6547           4 :         if (sk->sk_state != TCP_CLOSE) {
    6548           4 :                 tcp_data_snd_check(sk);
    6549           4 :                 tcp_ack_snd_check(sk);
    6550             :         }
    6551             : 
    6552           4 :         if (!queued) {
    6553           0 : discard:
    6554           3 :                 tcp_drop(sk, skb);
    6555             :         }
    6556             :         return 0;
    6557             : }
    6558             : EXPORT_SYMBOL(tcp_rcv_state_process);
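
The covered branches above (TCP_SYN_RECV, 4 hits; TCP_LAST_ACK, 3 hits) correspond to an ordinary passive open followed by a passive close. The following user-space sketch is illustrative only, not part of tcp_input.c or of the coverage data: it drives those transitions over loopback, and which close-side states are actually visited depends on timing.

        #include <arpa/inet.h>
        #include <netinet/in.h>
        #include <string.h>
        #include <sys/socket.h>
        #include <unistd.h>

        int main(void)
        {
                struct sockaddr_in addr;
                socklen_t alen = sizeof(addr);
                int lfd, cfd, afd;

                lfd = socket(AF_INET, SOCK_STREAM, 0);
                cfd = socket(AF_INET, SOCK_STREAM, 0);
                if (lfd < 0 || cfd < 0)
                        return 1;

                memset(&addr, 0, sizeof(addr));
                addr.sin_family = AF_INET;
                addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
                addr.sin_port = 0;      /* let the kernel pick a free port */

                if (bind(lfd, (struct sockaddr *)&addr, sizeof(addr)) < 0 ||
                    listen(lfd, 16) < 0 ||
                    getsockname(lfd, (struct sockaddr *)&addr, &alen) < 0)
                        return 1;

                /* The three-way handshake triggered by this connect() creates a
                 * request socket on the listener; the handshake-completing ACK
                 * is then handled by tcp_rcv_state_process() in TCP_SYN_RECV
                 * and moves the child socket to TCP_ESTABLISHED.
                 */
                if (connect(cfd, (struct sockaddr *)&addr, sizeof(addr)) < 0)
                        return 1;
                afd = accept(lfd, NULL, NULL);

                /* Closing the client first normally leaves the accepted socket
                 * in CLOSE_WAIT; closing it afterwards then takes it through
                 * TCP_LAST_ACK, the other branch covered above.
                 */
                close(cfd);
                if (afd >= 0)
                        close(afd);
                close(lfd);
                return 0;
        }
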
    6559             : 
    6560           0 : static inline void pr_drop_req(struct request_sock *req, __u16 port, int family)
    6561             : {
    6562           0 :         struct inet_request_sock *ireq = inet_rsk(req);
    6563             : 
    6564           0 :         if (family == AF_INET)
    6565           0 :                 net_dbg_ratelimited("drop open request from %pI4/%u\n",
    6566             :                                     &ireq->ir_rmt_addr, port);
    6567             : #if IS_ENABLED(CONFIG_IPV6)
    6568             :         else if (family == AF_INET6)
    6569             :                 net_dbg_ratelimited("drop open request from %pI6/%u\n",
    6570             :                                     &ireq->ir_v6_rmt_addr, port);
    6571             : #endif
    6572             : }
    6573             : 
    6574             : /* RFC 3168, section 6.1.1: SYN packets must not have the ECT/ECN bits set.
    6575             :  *
    6576             :  * If we receive a SYN packet with these bits set, it means a
    6577             :  * network is playing bad games with the TOS bits. In order to
    6578             :  * avoid possible false congestion notifications, we disable
    6579             :  * TCP ECN negotiation.
    6580             :  *
    6581             :  * Exception: tcp_ca wants ECN. This is required for DCTCP
    6582             :  * congestion control: Linux DCTCP asserts ECT on all packets,
    6583             :  * including SYN, which is the optimal solution; however,
    6584             :  * others, such as FreeBSD, do not.
    6585             :  *
    6586             :  * Exception: at least one of the reserved bits of the TCP header (th->res1) is
    6587             :  * set, indicating the use of a future TCP extension (such as AccECN). See
    6588             :  * RFC 8311 §4.3, which updates RFC 3168 to allow the development of such
    6589             :  * extensions.
    6590             :  */
    6591           4 : static void tcp_ecn_create_request(struct request_sock *req,
    6592             :                                    const struct sk_buff *skb,
    6593             :                                    const struct sock *listen_sk,
    6594             :                                    const struct dst_entry *dst)
    6595             : {
    6596           4 :         const struct tcphdr *th = tcp_hdr(skb);
    6597           4 :         const struct net *net = sock_net(listen_sk);
    6598           4 :         bool th_ecn = th->ece && th->cwr;
    6599           4 :         bool ect, ecn_ok;
    6600           4 :         u32 ecn_ok_dst;
    6601             : 
    6602           4 :         if (!th_ecn)
    6603             :                 return;
    6604             : 
    6605           0 :         ect = !INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield);
    6606           0 :         ecn_ok_dst = dst_feature(dst, DST_FEATURE_ECN_MASK);
    6607           0 :         ecn_ok = net->ipv4.sysctl_tcp_ecn || ecn_ok_dst;
    6608             : 
    6609           0 :         if (((!ect || th->res1) && ecn_ok) || tcp_ca_needs_ecn(listen_sk) ||
    6610           0 :             (ecn_ok_dst & DST_FEATURE_ECN_CA) ||
    6611           0 :             tcp_bpf_ca_needs_ecn((struct sock *)req))
    6612           0 :                 inet_rsk(req)->ecn_ok = 1;
    6613             : }
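
For reference, the ECN decision above reduces to two header predicates. The snippet below is a minimal sketch using the UAPI struct tcphdr from <linux/tcp.h>; it only mirrors the header tests and ignores the sysctl, dst-metric and congestion-control inputs that tcp_ecn_create_request() also consults.

        #include <linux/tcp.h>
        #include <stdbool.h>
        #include <stdio.h>
        #include <string.h>

        /* RFC 3168 ECN-setup SYN: both ECE and CWR are set in the SYN. */
        static bool syn_is_ecn_setup(const struct tcphdr *th)
        {
                return th->syn && th->ece && th->cwr;
        }

        /* RFC 8311: a reserved bit (res1) may signal a future extension such
         * as AccECN, which is why the kernel does not insist on ECT being
         * absent in that case.
         */
        static bool syn_uses_reserved_bits(const struct tcphdr *th)
        {
                return th->res1 != 0;
        }

        int main(void)
        {
                struct tcphdr th;

                memset(&th, 0, sizeof(th));
                th.syn = 1;
                th.ece = 1;
                th.cwr = 1;

                printf("ECN-setup SYN: %d, reserved bits: %d\n",
                       syn_is_ecn_setup(&th), syn_uses_reserved_bits(&th));
                return 0;
        }
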
    6614             : 
    6615           4 : static void tcp_openreq_init(struct request_sock *req,
    6616             :                              const struct tcp_options_received *rx_opt,
    6617             :                              struct sk_buff *skb, const struct sock *sk)
    6618             : {
    6619           4 :         struct inet_request_sock *ireq = inet_rsk(req);
    6620             : 
    6621           4 :         req->rsk_rcv_wnd = 0;                /* So that tcp_send_synack() knows! */
    6622           4 :         tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq;
    6623           4 :         tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
    6624           4 :         tcp_rsk(req)->snt_synack = 0;
    6625           4 :         tcp_rsk(req)->last_oow_ack_time = 0;
    6626           4 :         req->mss = rx_opt->mss_clamp;
    6627           4 :         req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0;
    6628           4 :         ireq->tstamp_ok = rx_opt->tstamp_ok;
    6629           4 :         ireq->sack_ok = rx_opt->sack_ok;
    6630           4 :         ireq->snd_wscale = rx_opt->snd_wscale;
    6631           4 :         ireq->wscale_ok = rx_opt->wscale_ok;
    6632           4 :         ireq->acked = 0;
    6633           4 :         ireq->ecn_ok = 0;
    6634           4 :         ireq->ir_rmt_port = tcp_hdr(skb)->source;
    6635           4 :         ireq->ir_num = ntohs(tcp_hdr(skb)->dest);
    6636           4 :         ireq->ir_mark = inet_request_mark(sk, skb);
    6637             : #if IS_ENABLED(CONFIG_SMC)
    6638             :         ireq->smc_ok = rx_opt->smc_ok;
    6639             : #endif
    6640           4 : }
    6641             : 
    6642           4 : struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops,
    6643             :                                       struct sock *sk_listener,
    6644             :                                       bool attach_listener)
    6645             : {
    6646           4 :         struct request_sock *req = reqsk_alloc(ops, sk_listener,
    6647             :                                                attach_listener);
    6648             : 
    6649           4 :         if (req) {
    6650           4 :                 struct inet_request_sock *ireq = inet_rsk(req);
    6651             : 
    6652           4 :                 ireq->ireq_opt = NULL;
    6653             : #if IS_ENABLED(CONFIG_IPV6)
    6654             :                 ireq->pktopts = NULL;
    6655             : #endif
    6656           4 :                 atomic64_set(&ireq->ir_cookie, 0);
    6657           4 :                 ireq->ireq_state = TCP_NEW_SYN_RECV;
    6658           4 :                 write_pnet(&ireq->ireq_net, sock_net(sk_listener));
    6659           4 :                 ireq->ireq_family = sk_listener->sk_family;
    6660             :         }
    6661             : 
    6662           4 :         return req;
    6663             : }
    6664             : EXPORT_SYMBOL(inet_reqsk_alloc);
    6665             : 
    6666             : /*
    6667             :  * Return true if a syncookie should be sent
    6668             :  */
    6669           0 : static bool tcp_syn_flood_action(const struct sock *sk, const char *proto)
    6670             : {
    6671           0 :         struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
    6672           0 :         const char *msg = "Dropping request";
    6673           0 :         bool want_cookie = false;
    6674           0 :         struct net *net = sock_net(sk);
    6675             : 
    6676             : #ifdef CONFIG_SYN_COOKIES
    6677             :         if (net->ipv4.sysctl_tcp_syncookies) {
    6678             :                 msg = "Sending cookies";
    6679             :                 want_cookie = true;
    6680             :                 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES);
    6681             :         } else
    6682             : #endif
    6683           0 :                 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);
    6684             : 
    6685           0 :         if (!queue->synflood_warned &&
    6686           0 :             net->ipv4.sysctl_tcp_syncookies != 2 &&
    6687           0 :             xchg(&queue->synflood_warned, 1) == 0)
    6688           0 :                 net_info_ratelimited("%s: Possible SYN flooding on port %d. %s.  Check SNMP counters.\n",
    6689             :                                      proto, sk->sk_num, msg);
    6690             : 
    6691           0 :         return want_cookie;
    6692             : }
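
The want_cookie decision is governed by the net.ipv4.tcp_syncookies sysctl: 0 never sends cookies, 1 sends them only when the request queue overflows (the path above), and 2 sends them unconditionally. A small illustrative sketch that reads the current setting through procfs:

        #include <stdio.h>

        /* Read net.ipv4.tcp_syncookies: 0 = never, 1 = only under SYN-queue
         * pressure, 2 = always.
         */
        int main(void)
        {
                FILE *f = fopen("/proc/sys/net/ipv4/tcp_syncookies", "r");
                int val = -1;

                if (!f) {
                        perror("tcp_syncookies");
                        return 1;
                }
                if (fscanf(f, "%d", &val) != 1)
                        val = -1;
                fclose(f);
                printf("net.ipv4.tcp_syncookies = %d\n", val);
                return val >= 0 ? 0 : 1;
        }
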
    6693             : 
    6694           4 : static void tcp_reqsk_record_syn(const struct sock *sk,
    6695             :                                  struct request_sock *req,
    6696             :                                  const struct sk_buff *skb)
    6697             : {
    6698           4 :         if (tcp_sk(sk)->save_syn) {
    6699           0 :                 u32 len = skb_network_header_len(skb) + tcp_hdrlen(skb);
    6700           0 :                 struct saved_syn *saved_syn;
    6701           0 :                 u32 mac_hdrlen;
    6702           0 :                 void *base;
    6703             : 
    6704           0 :                 if (tcp_sk(sk)->save_syn == 2) {  /* Save full header. */
    6705           0 :                         base = skb_mac_header(skb);
    6706           0 :                         mac_hdrlen = skb_mac_header_len(skb);
    6707           0 :                         len += mac_hdrlen;
    6708             :                 } else {
    6709           0 :                         base = skb_network_header(skb);
    6710           0 :                         mac_hdrlen = 0;
    6711             :                 }
    6712             : 
    6713           0 :                 saved_syn = kmalloc(struct_size(saved_syn, data, len),
    6714             :                                     GFP_ATOMIC);
    6715           0 :                 if (saved_syn) {
    6716           0 :                         saved_syn->mac_hdrlen = mac_hdrlen;
    6717           0 :                         saved_syn->network_hdrlen = skb_network_header_len(skb);
    6718           0 :                         saved_syn->tcp_hdrlen = tcp_hdrlen(skb);
    6719           0 :                         memcpy(saved_syn->data, base, len);
    6720           0 :                         req->saved_syn = saved_syn;
    6721             :                 }
    6722             :         }
    6723           4 : }
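
tp->save_syn is normally set from user space with the TCP_SAVE_SYN socket option on the listener, and the headers recorded here are read back with TCP_SAVED_SYN on the accepted socket. A hedged sketch, assuming a libc that exposes both option names in <netinet/tcp.h>:

        #include <netinet/in.h>
        #include <netinet/tcp.h>
        #include <stdio.h>
        #include <sys/socket.h>

        /* Enable recording of incoming SYNs on a listening socket. This must
         * be done before the connection of interest arrives.
         */
        static int enable_save_syn(int listen_fd)
        {
                int one = 1;

                return setsockopt(listen_fd, IPPROTO_TCP, TCP_SAVE_SYN,
                                  &one, sizeof(one));
        }

        /* Read back the headers recorded by tcp_reqsk_record_syn() for an
         * accept()ed socket; the kernel discards the saved data after a
         * successful read, so this works only once per socket.
         */
        static int dump_saved_syn(int accepted_fd)
        {
                unsigned char syn[512];
                socklen_t len = sizeof(syn);

                if (getsockopt(accepted_fd, IPPROTO_TCP, TCP_SAVED_SYN,
                               syn, &len) < 0)
                        return -1;
                printf("saved SYN headers: %u bytes\n", (unsigned int)len);
                return 0;
        }
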
    6724             : 
    6725             : /* If a SYN cookie is required and supported, returns a clamped MSS value to be
    6726             :  * used for SYN cookie generation.
    6727             :  */
    6728           0 : u16 tcp_get_syncookie_mss(struct request_sock_ops *rsk_ops,
    6729             :                           const struct tcp_request_sock_ops *af_ops,
    6730             :                           struct sock *sk, struct tcphdr *th)
    6731             : {
    6732           0 :         struct tcp_sock *tp = tcp_sk(sk);
    6733           0 :         u16 mss;
    6734             : 
    6735           0 :         if (sock_net(sk)->ipv4.sysctl_tcp_syncookies != 2 &&
    6736           0 :             !inet_csk_reqsk_queue_is_full(sk))
    6737             :                 return 0;
    6738             : 
    6739           0 :         if (!tcp_syn_flood_action(sk, rsk_ops->slab_name))
    6740             :                 return 0;
    6741             : 
    6742           0 :         if (sk_acceptq_is_full(sk)) {
    6743           0 :                 NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
    6744           0 :                 return 0;
    6745             :         }
    6746             : 
    6747           0 :         mss = tcp_parse_mss_option(th, tp->rx_opt.user_mss);
    6748           0 :         if (!mss)
    6749           0 :                 mss = af_ops->mss_clamp;
    6750             : 
    6751             :         return mss;
    6752             : }
    6753             : EXPORT_SYMBOL_GPL(tcp_get_syncookie_mss);
    6754             : 
    6755           4 : int tcp_conn_request(struct request_sock_ops *rsk_ops,
    6756             :                      const struct tcp_request_sock_ops *af_ops,
    6757             :                      struct sock *sk, struct sk_buff *skb)
    6758             : {
    6759           4 :         struct tcp_fastopen_cookie foc = { .len = -1 };
    6760           4 :         __u32 isn = TCP_SKB_CB(skb)->tcp_tw_isn;
    6761           4 :         struct tcp_options_received tmp_opt;
    6762           4 :         struct tcp_sock *tp = tcp_sk(sk);
    6763           4 :         struct net *net = sock_net(sk);
    6764           4 :         struct sock *fastopen_sk = NULL;
    6765           4 :         struct request_sock *req;
    6766           4 :         bool want_cookie = false;
    6767           4 :         struct dst_entry *dst;
    6768           4 :         struct flowi fl;
    6769             : 
    6770             :         /* TW buckets are converted to open requests without
    6771             :          * limitation: they conserve resources and the peer is
    6772             :          * evidently a real one.
    6773             :          */
    6774           4 :         if ((net->ipv4.sysctl_tcp_syncookies == 2 ||
    6775           4 :              inet_csk_reqsk_queue_is_full(sk)) && !isn) {
    6776           0 :                 want_cookie = tcp_syn_flood_action(sk, rsk_ops->slab_name);
    6777           0 :                 if (!want_cookie)
    6778           0 :                         goto drop;
    6779             :         }
    6780             : 
    6781           4 :         if (sk_acceptq_is_full(sk)) {
    6782           0 :                 NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
    6783           0 :                 goto drop;
    6784             :         }
    6785             : 
    6786           4 :         req = inet_reqsk_alloc(rsk_ops, sk, !want_cookie);
    6787           4 :         if (!req)
    6788           0 :                 goto drop;
    6789             : 
    6790           4 :         req->syncookie = want_cookie;
    6791           4 :         tcp_rsk(req)->af_specific = af_ops;
    6792           4 :         tcp_rsk(req)->ts_off = 0;
    6793             : #if IS_ENABLED(CONFIG_MPTCP)
    6794             :         tcp_rsk(req)->is_mptcp = 0;
    6795             : #endif
    6796             : 
    6797           4 :         tcp_clear_options(&tmp_opt);
    6798           4 :         tmp_opt.mss_clamp = af_ops->mss_clamp;
    6799           4 :         tmp_opt.user_mss  = tp->rx_opt.user_mss;
    6800           8 :         tcp_parse_options(sock_net(sk), skb, &tmp_opt, 0,
    6801             :                           want_cookie ? NULL : &foc);
    6802             : 
    6803           4 :         if (want_cookie && !tmp_opt.saw_tstamp)
    6804           0 :                 tcp_clear_options(&tmp_opt);
    6805             : 
    6806           4 :         if (IS_ENABLED(CONFIG_SMC) && want_cookie)
    6807             :                 tmp_opt.smc_ok = 0;
    6808             : 
    6809           4 :         tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
    6810           4 :         tcp_openreq_init(req, &tmp_opt, skb, sk);
    6811           4 :         inet_rsk(req)->no_srccheck = inet_sk(sk)->transparent;
    6812             : 
    6813             :         /* Note: tcp_v6_init_req() might override ir_iif for link-local addresses */
    6814           4 :         inet_rsk(req)->ir_iif = inet_request_bound_dev_if(sk, skb);
    6815             : 
    6816           4 :         dst = af_ops->route_req(sk, skb, &fl, req);
    6817           4 :         if (!dst)
    6818           0 :                 goto drop_and_free;
    6819             : 
    6820           4 :         if (tmp_opt.tstamp_ok)
    6821           0 :                 tcp_rsk(req)->ts_off = af_ops->init_ts_off(net, skb);
    6822             : 
    6823           4 :         if (!want_cookie && !isn) {
    6824             :                 /* Kill the following clause if you dislike this heuristic. */
    6825           4 :                 if (!net->ipv4.sysctl_tcp_syncookies &&
    6826           0 :                     (net->ipv4.sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
    6827           0 :                      (net->ipv4.sysctl_max_syn_backlog >> 2)) &&
    6828           0 :                     !tcp_peer_is_proven(req, dst)) {
    6829             :                         /* Without syncookies, the last quarter of
    6830             :                          * the backlog is reserved for destinations
    6831             :                          * proven to be alive.
    6832             :                          * This means that we keep communicating with
    6833             :                          * destinations that were already remembered
    6834             :                          * at the moment the SYN flood started.
    6835             :                          */
    6836           0 :                         pr_drop_req(req, ntohs(tcp_hdr(skb)->source),
    6837             :                                     rsk_ops->family);
    6838           0 :                         goto drop_and_release;
    6839             :                 }
    6840             : 
    6841           4 :                 isn = af_ops->init_seq(skb);
    6842             :         }
    6843             : 
    6844           4 :         tcp_ecn_create_request(req, skb, sk, dst);
    6845             : 
    6846           4 :         if (want_cookie) {
    6847           0 :                 isn = cookie_init_sequence(af_ops, sk, skb, &req->mss);
    6848           0 :                 if (!tmp_opt.tstamp_ok)
    6849           0 :                         inet_rsk(req)->ecn_ok = 0;
    6850             :         }
    6851             : 
    6852           4 :         tcp_rsk(req)->snt_isn = isn;
    6853           4 :         tcp_rsk(req)->txhash = net_tx_rndhash();
    6854           4 :         tcp_rsk(req)->syn_tos = TCP_SKB_CB(skb)->ip_dsfield;
    6855           4 :         tcp_openreq_init_rwin(req, sk, dst);
    6856           4 :         sk_rx_queue_set(req_to_sk(req), skb);
    6857           4 :         if (!want_cookie) {
    6858           4 :                 tcp_reqsk_record_syn(sk, req, skb);
    6859           4 :                 fastopen_sk = tcp_try_fastopen(sk, skb, req, &foc, dst);
    6860             :         }
    6861           4 :         if (fastopen_sk) {
    6862           0 :                 af_ops->send_synack(fastopen_sk, dst, &fl, req,
    6863             :                                     &foc, TCP_SYNACK_FASTOPEN, skb);
    6864             :                 /* Add the child socket directly into the accept queue */
    6865           0 :                 if (!inet_csk_reqsk_queue_add(sk, req, fastopen_sk)) {
    6866           0 :                         reqsk_fastopen_remove(fastopen_sk, req, false);
    6867           0 :                         bh_unlock_sock(fastopen_sk);
    6868           0 :                         sock_put(fastopen_sk);
    6869           0 :                         goto drop_and_free;
    6870             :                 }
    6871           0 :                 sk->sk_data_ready(sk);
    6872           0 :                 bh_unlock_sock(fastopen_sk);
    6873           0 :                 sock_put(fastopen_sk);
    6874             :         } else {
    6875           4 :                 tcp_rsk(req)->tfo_listener = false;
    6876           4 :                 if (!want_cookie)
    6877           4 :                         inet_csk_reqsk_queue_hash_add(sk, req,
    6878           4 :                                 tcp_timeout_init((struct sock *)req));
    6879           4 :                 af_ops->send_synack(sk, dst, &fl, req, &foc,
    6880             :                                     !want_cookie ? TCP_SYNACK_NORMAL :
    6881             :                                                    TCP_SYNACK_COOKIE,
    6882             :                                     skb);
    6883           4 :                 if (want_cookie) {
    6884           0 :                         reqsk_free(req);
    6885           0 :                         return 0;
    6886             :                 }
    6887             :         }
    6888           4 :         reqsk_put(req);
    6889           4 :         return 0;
    6890             : 
    6891           0 : drop_and_release:
    6892           0 :         dst_release(dst);
    6893           0 : drop_and_free:
    6894           0 :         __reqsk_free(req);
    6895           0 : drop:
    6896           0 :         tcp_listendrop(sk);
    6897           0 :         return 0;
    6898             : }
    6899             : EXPORT_SYMBOL(tcp_conn_request);
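
The fastopen_sk branch above is only reachable when the listener has opted into TCP Fast Open; a minimal sketch of that opt-in follows (server-side TFO additionally requires the net.ipv4.tcp_fastopen sysctl to have the server bit, 0x2, set). This is illustrative only, not part of tcp_input.c.

        #include <netinet/in.h>
        #include <netinet/tcp.h>
        #include <sys/socket.h>

        /* Opt a listening socket into TCP Fast Open so that a SYN carrying a
         * valid TFO cookie lets tcp_try_fastopen() hand back a fully created
         * child socket (the fastopen_sk branch) instead of only a request
         * socket.
         */
        static int enable_tfo_listener(int listen_fd)
        {
                int qlen = 16;  /* max number of pending TFO requests */

                return setsockopt(listen_fd, IPPROTO_TCP, TCP_FASTOPEN,
                                  &qlen, sizeof(qlen));
        }
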

Generated by: LCOV version 1.14