LCOV - code coverage report
Current view: top level - net/ipv4 - tcp.c (source / functions)
Test: landlock.info
Date: 2021-04-22 12:43:58

              Hit    Total    Coverage
Lines:        551     2204      25.0 %
Functions:     32       89      36.0 %

          Line data    Source code
       1             : // SPDX-License-Identifier: GPL-2.0-or-later
       2             : /*
       3             :  * INET         An implementation of the TCP/IP protocol suite for the LINUX
       4             :  *              operating system.  INET is implemented using the  BSD Socket
       5             :  *              interface as the means of communication with the user level.
       6             :  *
       7             :  *              Implementation of the Transmission Control Protocol(TCP).
       8             :  *
       9             :  * Authors:     Ross Biro
      10             :  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
      11             :  *              Mark Evans, <evansmp@uhura.aston.ac.uk>
      12             :  *              Corey Minyard <wf-rch!minyard@relay.EU.net>
      13             :  *              Florian La Roche, <flla@stud.uni-sb.de>
      14             :  *              Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
      15             :  *              Linus Torvalds, <torvalds@cs.helsinki.fi>
      16             :  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
      17             :  *              Matthew Dillon, <dillon@apollo.west.oic.com>
      18             :  *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
      19             :  *              Jorge Cwik, <jorge@laser.satlink.net>
      20             :  *
      21             :  * Fixes:
      22             :  *              Alan Cox        :       Numerous verify_area() calls
      23             :  *              Alan Cox        :       Set the ACK bit on a reset
      24             :  *              Alan Cox        :       Stopped it crashing if it closed while
      25             :  *                                      sk->inuse=1 and was trying to connect
      26             :  *                                      (tcp_err()).
      27             :  *              Alan Cox        :       All icmp error handling was broken
       28             :  *                                      pointers passed were wrong and the
      29             :  *                                      socket was looked up backwards. Nobody
      30             :  *                                      tested any icmp error code obviously.
      31             :  *              Alan Cox        :       tcp_err() now handled properly. It
      32             :  *                                      wakes people on errors. poll
      33             :  *                                      behaves and the icmp error race
      34             :  *                                      has gone by moving it into sock.c
      35             :  *              Alan Cox        :       tcp_send_reset() fixed to work for
      36             :  *                                      everything not just packets for
      37             :  *                                      unknown sockets.
      38             :  *              Alan Cox        :       tcp option processing.
      39             :  *              Alan Cox        :       Reset tweaked (still not 100%) [Had
      40             :  *                                      syn rule wrong]
      41             :  *              Herp Rosmanith  :       More reset fixes
      42             :  *              Alan Cox        :       No longer acks invalid rst frames.
      43             :  *                                      Acking any kind of RST is right out.
       44             :  *              Alan Cox        :       Sets an 'ignore me' flag on an RST
       45             :  *                                      receive, otherwise odd bits of prattle
       46             :  *                                      still escape
      47             :  *              Alan Cox        :       Fixed another acking RST frame bug.
      48             :  *                                      Should stop LAN workplace lockups.
      49             :  *              Alan Cox        :       Some tidyups using the new skb list
      50             :  *                                      facilities
      51             :  *              Alan Cox        :       sk->keepopen now seems to work
      52             :  *              Alan Cox        :       Pulls options out correctly on accepts
      53             :  *              Alan Cox        :       Fixed assorted sk->rqueue->next errors
      54             :  *              Alan Cox        :       PSH doesn't end a TCP read. Switched a
      55             :  *                                      bit to skb ops.
      56             :  *              Alan Cox        :       Tidied tcp_data to avoid a potential
      57             :  *                                      nasty.
      58             :  *              Alan Cox        :       Added some better commenting, as the
      59             :  *                                      tcp is hard to follow
      60             :  *              Alan Cox        :       Removed incorrect check for 20 * psh
      61             :  *      Michael O'Reilly        :       ack < copied bug fix.
      62             :  *      Johannes Stille         :       Misc tcp fixes (not all in yet).
      63             :  *              Alan Cox        :       FIN with no memory -> CRASH
      64             :  *              Alan Cox        :       Added socket option proto entries.
      65             :  *                                      Also added awareness of them to accept.
      66             :  *              Alan Cox        :       Added TCP options (SOL_TCP)
      67             :  *              Alan Cox        :       Switched wakeup calls to callbacks,
      68             :  *                                      so the kernel can layer network
      69             :  *                                      sockets.
      70             :  *              Alan Cox        :       Use ip_tos/ip_ttl settings.
      71             :  *              Alan Cox        :       Handle FIN (more) properly (we hope).
      72             :  *              Alan Cox        :       RST frames sent on unsynchronised
      73             :  *                                      state ack error.
      74             :  *              Alan Cox        :       Put in missing check for SYN bit.
      75             :  *              Alan Cox        :       Added tcp_select_window() aka NET2E
      76             :  *                                      window non shrink trick.
      77             :  *              Alan Cox        :       Added a couple of small NET2E timer
      78             :  *                                      fixes
      79             :  *              Charles Hedrick :       TCP fixes
      80             :  *              Toomas Tamm     :       TCP window fixes
      81             :  *              Alan Cox        :       Small URG fix to rlogin ^C ack fight
      82             :  *              Charles Hedrick :       Rewrote most of it to actually work
      83             :  *              Linus           :       Rewrote tcp_read() and URG handling
      84             :  *                                      completely
      85             :  *              Gerhard Koerting:       Fixed some missing timer handling
      86             :  *              Matthew Dillon  :       Reworked TCP machine states as per RFC
      87             :  *              Gerhard Koerting:       PC/TCP workarounds
      88             :  *              Adam Caldwell   :       Assorted timer/timing errors
      89             :  *              Matthew Dillon  :       Fixed another RST bug
      90             :  *              Alan Cox        :       Move to kernel side addressing changes.
      91             :  *              Alan Cox        :       Beginning work on TCP fastpathing
      92             :  *                                      (not yet usable)
      93             :  *              Arnt Gulbrandsen:       Turbocharged tcp_check() routine.
      94             :  *              Alan Cox        :       TCP fast path debugging
      95             :  *              Alan Cox        :       Window clamping
      96             :  *              Michael Riepe   :       Bug in tcp_check()
      97             :  *              Matt Dillon     :       More TCP improvements and RST bug fixes
       98             :  *              Matt Dillon     :       Yet more small nasties removed from the
      99             :  *                                      TCP code (Be very nice to this man if
     100             :  *                                      tcp finally works 100%) 8)
     101             :  *              Alan Cox        :       BSD accept semantics.
     102             :  *              Alan Cox        :       Reset on closedown bug.
     103             :  *      Peter De Schrijver      :       ENOTCONN check missing in tcp_sendto().
     104             :  *              Michael Pall    :       Handle poll() after URG properly in
     105             :  *                                      all cases.
     106             :  *              Michael Pall    :       Undo the last fix in tcp_read_urg()
     107             :  *                                      (multi URG PUSH broke rlogin).
     108             :  *              Michael Pall    :       Fix the multi URG PUSH problem in
     109             :  *                                      tcp_readable(), poll() after URG
     110             :  *                                      works now.
     111             :  *              Michael Pall    :       recv(...,MSG_OOB) never blocks in the
     112             :  *                                      BSD api.
     113             :  *              Alan Cox        :       Changed the semantics of sk->socket to
     114             :  *                                      fix a race and a signal problem with
     115             :  *                                      accept() and async I/O.
     116             :  *              Alan Cox        :       Relaxed the rules on tcp_sendto().
     117             :  *              Yury Shevchuk   :       Really fixed accept() blocking problem.
     118             :  *              Craig I. Hagan  :       Allow for BSD compatible TIME_WAIT for
     119             :  *                                      clients/servers which listen in on
     120             :  *                                      fixed ports.
     121             :  *              Alan Cox        :       Cleaned the above up and shrank it to
     122             :  *                                      a sensible code size.
     123             :  *              Alan Cox        :       Self connect lockup fix.
     124             :  *              Alan Cox        :       No connect to multicast.
     125             :  *              Ross Biro       :       Close unaccepted children on master
     126             :  *                                      socket close.
     127             :  *              Alan Cox        :       Reset tracing code.
     128             :  *              Alan Cox        :       Spurious resets on shutdown.
     129             :  *              Alan Cox        :       Giant 15 minute/60 second timer error
     130             :  *              Alan Cox        :       Small whoops in polling before an
     131             :  *                                      accept.
     132             :  *              Alan Cox        :       Kept the state trace facility since
     133             :  *                                      it's handy for debugging.
     134             :  *              Alan Cox        :       More reset handler fixes.
     135             :  *              Alan Cox        :       Started rewriting the code based on
      136             :  *                                      the RFCs. For other useful protocol
      137             :  *                                      references see: Comer, KA9Q NOS, and
     138             :  *                                      for a reference on the difference
     139             :  *                                      between specifications and how BSD
     140             :  *                                      works see the 4.4lite source.
     141             :  *              A.N.Kuznetsov   :       Don't time wait on completion of tidy
     142             :  *                                      close.
     143             :  *              Linus Torvalds  :       Fin/Shutdown & copied_seq changes.
     144             :  *              Linus Torvalds  :       Fixed BSD port reuse to work first syn
     145             :  *              Alan Cox        :       Reimplemented timers as per the RFC
     146             :  *                                      and using multiple timers for sanity.
     147             :  *              Alan Cox        :       Small bug fixes, and a lot of new
     148             :  *                                      comments.
     149             :  *              Alan Cox        :       Fixed dual reader crash by locking
     150             :  *                                      the buffers (much like datagram.c)
     151             :  *              Alan Cox        :       Fixed stuck sockets in probe. A probe
      152             :  *                                      now gets fed up with retrying without
      153             :  *                                      an answer (even a 'no space' one).
     154             :  *              Alan Cox        :       Extracted closing code better
     155             :  *              Alan Cox        :       Fixed the closing state machine to
     156             :  *                                      resemble the RFC.
     157             :  *              Alan Cox        :       More 'per spec' fixes.
     158             :  *              Jorge Cwik      :       Even faster checksumming.
     159             :  *              Alan Cox        :       tcp_data() doesn't ack illegal PSH
     160             :  *                                      only frames. At least one pc tcp stack
     161             :  *                                      generates them.
     162             :  *              Alan Cox        :       Cache last socket.
     163             :  *              Alan Cox        :       Per route irtt.
     164             :  *              Matt Day        :       poll()->select() match BSD precisely on error
     165             :  *              Alan Cox        :       New buffers
     166             :  *              Marc Tamsky     :       Various sk->prot->retransmits and
     167             :  *                                      sk->retransmits misupdating fixed.
     168             :  *                                      Fixed tcp_write_timeout: stuck close,
      169             :  *                                      and TCP syn retries get used now.
     170             :  *              Mark Yarvis     :       In tcp_read_wakeup(), don't send an
     171             :  *                                      ack if state is TCP_CLOSED.
     172             :  *              Alan Cox        :       Look up device on a retransmit - routes may
     173             :  *                                      change. Doesn't yet cope with MSS shrink right
     174             :  *                                      but it's a start!
     175             :  *              Marc Tamsky     :       Closing in closing fixes.
     176             :  *              Mike Shaver     :       RFC1122 verifications.
     177             :  *              Alan Cox        :       rcv_saddr errors.
     178             :  *              Alan Cox        :       Block double connect().
     179             :  *              Alan Cox        :       Small hooks for enSKIP.
     180             :  *              Alexey Kuznetsov:       Path MTU discovery.
     181             :  *              Alan Cox        :       Support soft errors.
     182             :  *              Alan Cox        :       Fix MTU discovery pathological case
     183             :  *                                      when the remote claims no mtu!
     184             :  *              Marc Tamsky     :       TCP_CLOSE fix.
     185             :  *              Colin (G3TNE)   :       Send a reset on syn ack replies in
     186             :  *                                      window but wrong (fixes NT lpd problems)
     187             :  *              Pedro Roque     :       Better TCP window handling, delayed ack.
     188             :  *              Joerg Reuter    :       No modification of locked buffers in
     189             :  *                                      tcp_do_retransmit()
     190             :  *              Eric Schenk     :       Changed receiver side silly window
     191             :  *                                      avoidance algorithm to BSD style
     192             :  *                                      algorithm. This doubles throughput
     193             :  *                                      against machines running Solaris,
     194             :  *                                      and seems to result in general
     195             :  *                                      improvement.
     196             :  *      Stefan Magdalinski      :       adjusted tcp_readable() to fix FIONREAD
     197             :  *      Willy Konynenberg       :       Transparent proxying support.
     198             :  *      Mike McLagan            :       Routing by source
     199             :  *              Keith Owens     :       Do proper merging with partial SKB's in
     200             :  *                                      tcp_do_sendmsg to avoid burstiness.
     201             :  *              Eric Schenk     :       Fix fast close down bug with
     202             :  *                                      shutdown() followed by close().
     203             :  *              Andi Kleen      :       Make poll agree with SIGIO
     204             :  *      Salvatore Sanfilippo    :       Support SO_LINGER with linger == 1 and
     205             :  *                                      lingertime == 0 (RFC 793 ABORT Call)
     206             :  *      Hirokazu Takahashi      :       Use copy_from_user() instead of
     207             :  *                                      csum_and_copy_from_user() if possible.
     208             :  *
     209             :  * Description of States:
     210             :  *
     211             :  *      TCP_SYN_SENT            sent a connection request, waiting for ack
     212             :  *
     213             :  *      TCP_SYN_RECV            received a connection request, sent ack,
     214             :  *                              waiting for final ack in three-way handshake.
     215             :  *
     216             :  *      TCP_ESTABLISHED         connection established
     217             :  *
     218             :  *      TCP_FIN_WAIT1           our side has shutdown, waiting to complete
     219             :  *                              transmission of remaining buffered data
     220             :  *
     221             :  *      TCP_FIN_WAIT2           all buffered data sent, waiting for remote
     222             :  *                              to shutdown
     223             :  *
     224             :  *      TCP_CLOSING             both sides have shutdown but we still have
     225             :  *                              data we have to finish sending
     226             :  *
     227             :  *      TCP_TIME_WAIT           timeout to catch resent junk before entering
     228             :  *                              closed, can only be entered from FIN_WAIT2
     229             :  *                              or CLOSING.  Required because the other end
     230             :  *                              may not have gotten our last ACK causing it
     231             :  *                              to retransmit the data packet (which we ignore)
     232             :  *
     233             :  *      TCP_CLOSE_WAIT          remote side has shutdown and is waiting for
     234             :  *                              us to finish writing our data and to shutdown
     235             :  *                              (we have to close() to move on to LAST_ACK)
     236             :  *
      237             :  *      TCP_LAST_ACK            our side has shutdown after remote has
     238             :  *                              shutdown.  There may still be data in our
     239             :  *                              buffer that we have to finish sending
     240             :  *
     241             :  *      TCP_CLOSE               socket is finished
     242             :  */
     243             : 
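The states above are observable from userspace. As a minimal sketch (the standard
TCP_INFO getsockopt(2) interface; the helper name is ours, not part of this file),
the current state of a socket can be read like this:

    /* Read a socket's TCP state; tcpi_state holds one of the
     * TCP_* values described above (TCP_ESTABLISHED == 1, etc.,
     * exposed to userspace via <netinet/tcp.h>).
     */
    #include <netinet/in.h>
    #include <netinet/tcp.h>
    #include <string.h>
    #include <sys/socket.h>

    static int tcp_state_of(int fd)
    {
            struct tcp_info ti;
            socklen_t len = sizeof(ti);

            memset(&ti, 0, sizeof(ti));
            if (getsockopt(fd, IPPROTO_TCP, TCP_INFO, &ti, &len) < 0)
                    return -1;
            return ti.tcpi_state;
    }
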
     244             : #define pr_fmt(fmt) "TCP: " fmt
     245             : 
     246             : #include <crypto/hash.h>
     247             : #include <linux/kernel.h>
     248             : #include <linux/module.h>
     249             : #include <linux/types.h>
     250             : #include <linux/fcntl.h>
     251             : #include <linux/poll.h>
     252             : #include <linux/inet_diag.h>
     253             : #include <linux/init.h>
     254             : #include <linux/fs.h>
     255             : #include <linux/skbuff.h>
     256             : #include <linux/scatterlist.h>
     257             : #include <linux/splice.h>
     258             : #include <linux/net.h>
     259             : #include <linux/socket.h>
     260             : #include <linux/random.h>
     261             : #include <linux/memblock.h>
     262             : #include <linux/highmem.h>
     263             : #include <linux/swap.h>
     264             : #include <linux/cache.h>
     265             : #include <linux/err.h>
     266             : #include <linux/time.h>
     267             : #include <linux/slab.h>
     268             : #include <linux/errqueue.h>
     269             : #include <linux/static_key.h>
     270             : 
     271             : #include <net/icmp.h>
     272             : #include <net/inet_common.h>
     273             : #include <net/tcp.h>
     274             : #include <net/mptcp.h>
     275             : #include <net/xfrm.h>
     276             : #include <net/ip.h>
     277             : #include <net/sock.h>
     278             : 
     279             : #include <linux/uaccess.h>
     280             : #include <asm/ioctls.h>
     281             : #include <net/busy_poll.h>
     282             : 
     283             : /* Track pending CMSGs. */
     284             : enum {
     285             :         TCP_CMSG_INQ = 1,
     286             :         TCP_CMSG_TS = 2
     287             : };
     288             : 
     289             : struct percpu_counter tcp_orphan_count;
     290             : EXPORT_SYMBOL_GPL(tcp_orphan_count);
     291             : 
     292             : long sysctl_tcp_mem[3] __read_mostly;
     293             : EXPORT_SYMBOL(sysctl_tcp_mem);
     294             : 
     295             : atomic_long_t tcp_memory_allocated;     /* Current allocated memory. */
     296             : EXPORT_SYMBOL(tcp_memory_allocated);
     297             : 
     298             : #if IS_ENABLED(CONFIG_SMC)
     299             : DEFINE_STATIC_KEY_FALSE(tcp_have_smc);
     300             : EXPORT_SYMBOL(tcp_have_smc);
     301             : #endif
     302             : 
     303             : /*
     304             :  * Current number of TCP sockets.
     305             :  */
     306             : struct percpu_counter tcp_sockets_allocated;
     307             : EXPORT_SYMBOL(tcp_sockets_allocated);
     308             : 
     309             : /*
     310             :  * TCP splice context
     311             :  */
     312             : struct tcp_splice_state {
     313             :         struct pipe_inode_info *pipe;
     314             :         size_t len;
     315             :         unsigned int flags;
     316             : };
     317             : 
     318             : /*
     319             :  * Pressure flag: try to collapse.
      320             :  * Technical note: it is used by multiple contexts non-atomically.
      321             :  * All of __sk_mem_schedule() is of this nature: accounting
     322             :  * is strict, actions are advisory and have some latency.
     323             :  */
     324             : unsigned long tcp_memory_pressure __read_mostly;
     325             : EXPORT_SYMBOL_GPL(tcp_memory_pressure);
     326             : 
     327             : DEFINE_STATIC_KEY_FALSE(tcp_rx_skb_cache_key);
     328             : EXPORT_SYMBOL(tcp_rx_skb_cache_key);
     329             : 
     330             : DEFINE_STATIC_KEY_FALSE(tcp_tx_skb_cache_key);
     331             : 
     332           0 : void tcp_enter_memory_pressure(struct sock *sk)
     333             : {
     334           0 :         unsigned long val;
     335             : 
     336           0 :         if (READ_ONCE(tcp_memory_pressure))
     337             :                 return;
     338           0 :         val = jiffies;
     339             : 
     340           0 :         if (!val)
     341           0 :                 val--;
     342           0 :         if (!cmpxchg(&tcp_memory_pressure, 0, val))
     343           0 :                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURES);
     344             : }
     345             : EXPORT_SYMBOL_GPL(tcp_enter_memory_pressure);
     346             : 
     347          13 : void tcp_leave_memory_pressure(struct sock *sk)
     348             : {
     349          13 :         unsigned long val;
     350             : 
     351          13 :         if (!READ_ONCE(tcp_memory_pressure))
     352             :                 return;
     353           0 :         val = xchg(&tcp_memory_pressure, 0);
     354           0 :         if (val)
     355           0 :                 NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURESCHRONO,
     356             :                               jiffies_to_msecs(jiffies - val));
     357             : }
     358             : EXPORT_SYMBOL_GPL(tcp_leave_memory_pressure);
     359             : 
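The enter/leave pair above implements a record-once timestamp: cmpxchg() stores
the start time only for the first caller, and xchg() atomically clears it while
returning how long the pressure episode lasted. A minimal userspace sketch of the
same idiom (C11 atomics; all names here are ours, purely illustrative):

    #include <stdatomic.h>
    #include <time.h>

    static atomic_ulong pressure;  /* 0 == not under pressure */

    static unsigned long now_ms(void)
    {
            struct timespec ts;

            clock_gettime(CLOCK_MONOTONIC, &ts);
            return ts.tv_sec * 1000UL + ts.tv_nsec / 1000000UL;
    }

    void enter_pressure(void)
    {
            unsigned long expected = 0, val = now_ms();

            if (!val)
                    val--;  /* keep 0 reserved for "not set", as above */
            /* Only the first caller records the start time. */
            atomic_compare_exchange_strong(&pressure, &expected, val);
    }

    unsigned long leave_pressure_ms(void)
    {
            unsigned long val = atomic_exchange(&pressure, 0);

            return val ? now_ms() - val : 0;  /* time under pressure */
    }
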
     360             : /* Convert seconds to retransmits based on initial and max timeout */
     361           0 : static u8 secs_to_retrans(int seconds, int timeout, int rto_max)
     362             : {
     363           0 :         u8 res = 0;
     364             : 
     365           0 :         if (seconds > 0) {
     366             :                 int period = timeout;
     367             : 
     368             :                 res = 1;
     369           0 :                 while (seconds > period && res < 255) {
     370           0 :                         res++;
     371           0 :                         timeout <<= 1;
     372           0 :                         if (timeout > rto_max)
     373             :                                 timeout = rto_max;
     374           0 :                         period += timeout;
     375             :                 }
     376             :         }
     377           0 :         return res;
     378             : }
     379             : 
     380             : /* Convert retransmits to seconds based on initial and max timeout */
     381           0 : static int retrans_to_secs(u8 retrans, int timeout, int rto_max)
     382             : {
     383           0 :         int period = 0;
     384             : 
     385           0 :         if (retrans > 0) {
     386             :                 period = timeout;
     387           0 :                 while (--retrans) {
     388           0 :                         timeout <<= 1;
     389           0 :                         if (timeout > rto_max)
     390             :                                 timeout = rto_max;
     391           0 :                         period += timeout;
     392             :                 }
     393             :         }
     394           0 :         return period;
     395             : }
     396             : 
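Worked example for the two helpers above: with a 1 s initial timeout
(TCP_TIMEOUT_INIT / HZ) and a 120 s cap (TCP_RTO_MAX / HZ), the retransmission
periods back off exponentially as 1, 2, 4, 8, 16, ... seconds. So
retrans_to_secs(5, 1, 120) accumulates 1 + 2 + 4 + 8 + 16 = 31 s, and
secs_to_retrans(31, 1, 120) maps 31 s back to 5 retransmits; at these boundary
values the two conversions are exact inverses.
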
     397           0 : static u64 tcp_compute_delivery_rate(const struct tcp_sock *tp)
     398             : {
     399           0 :         u32 rate = READ_ONCE(tp->rate_delivered);
     400           0 :         u32 intv = READ_ONCE(tp->rate_interval_us);
     401           0 :         u64 rate64 = 0;
     402             : 
     403           0 :         if (rate && intv) {
     404           0 :                 rate64 = (u64)rate * tp->mss_cache * USEC_PER_SEC;
     405           0 :                 do_div(rate64, intv);
     406             :         }
     407           0 :         return rate64;
     408             : }
     409             : 
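Worked example: if the most recent rate sample delivered rate_delivered = 10
segments with mss_cache = 1448 bytes over rate_interval_us = 10000 us, the result
is 10 * 1448 * 1000000 / 10000 = 1,448,000 bytes per second (about 1.45 MB/s).
Multiplying by USEC_PER_SEC before the do_div() keeps the computation in integer
arithmetic without losing precision.
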
     410             : /* Address-family independent initialization for a tcp_sock.
     411             :  *
      412             :  * NOTE: A lot of things are set to zero explicitly by the call to
      413             :  *       sk_alloc(), so they need not be done here.
     414             :  */
     415           3 : void tcp_init_sock(struct sock *sk)
     416             : {
     417           3 :         struct inet_connection_sock *icsk = inet_csk(sk);
     418           3 :         struct tcp_sock *tp = tcp_sk(sk);
     419             : 
     420           3 :         tp->out_of_order_queue = RB_ROOT;
     421           3 :         sk->tcp_rtx_queue = RB_ROOT;
     422           3 :         tcp_init_xmit_timers(sk);
     423           3 :         INIT_LIST_HEAD(&tp->tsq_node);
     424           3 :         INIT_LIST_HEAD(&tp->tsorted_sent_queue);
     425             : 
     426           3 :         icsk->icsk_rto = TCP_TIMEOUT_INIT;
     427           3 :         icsk->icsk_rto_min = TCP_RTO_MIN;
     428           3 :         icsk->icsk_delack_max = TCP_DELACK_MAX;
     429           3 :         tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
     430           3 :         minmax_reset(&tp->rtt_min, tcp_jiffies32, ~0U);
     431             : 
     432             :         /* So many TCP implementations out there (incorrectly) count the
     433             :          * initial SYN frame in their delayed-ACK and congestion control
     434             :          * algorithms that we must have the following bandaid to talk
     435             :          * efficiently to them.  -DaveM
     436             :          */
     437           3 :         tp->snd_cwnd = TCP_INIT_CWND;
     438             : 
     439             :         /* There's a bubble in the pipe until at least the first ACK. */
     440           3 :         tp->app_limited = ~0U;
     441             : 
     442             :         /* See draft-stevens-tcpca-spec-01 for discussion of the
     443             :          * initialization of these values.
     444             :          */
     445           3 :         tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
     446           3 :         tp->snd_cwnd_clamp = ~0;
     447           3 :         tp->mss_cache = TCP_MSS_DEFAULT;
     448             : 
     449           3 :         tp->reordering = sock_net(sk)->ipv4.sysctl_tcp_reordering;
     450           3 :         tcp_assign_congestion_control(sk);
     451             : 
     452           3 :         tp->tsoffset = 0;
     453           3 :         tp->rack.reo_wnd_steps = 1;
     454             : 
     455           3 :         sk->sk_write_space = sk_stream_write_space;
     456           3 :         sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
     457             : 
     458           3 :         icsk->icsk_sync_mss = tcp_sync_mss;
     459             : 
     460           3 :         WRITE_ONCE(sk->sk_sndbuf, sock_net(sk)->ipv4.sysctl_tcp_wmem[1]);
     461           3 :         WRITE_ONCE(sk->sk_rcvbuf, sock_net(sk)->ipv4.sysctl_tcp_rmem[1]);
     462             : 
     463           3 :         sk_sockets_allocated_inc(sk);
     464           3 :         sk->sk_route_forced_caps = NETIF_F_GSO;
     465           3 : }
     466             : EXPORT_SYMBOL(tcp_init_sock);
     467             : 
     468         411 : static void tcp_tx_timestamp(struct sock *sk, u16 tsflags)
     469             : {
     470         411 :         struct sk_buff *skb = tcp_write_queue_tail(sk);
     471             : 
     472         411 :         if (tsflags && skb) {
     473           0 :                 struct skb_shared_info *shinfo = skb_shinfo(skb);
     474           0 :                 struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
     475             : 
     476           0 :                 sock_tx_timestamp(sk, tsflags, &shinfo->tx_flags);
     477           0 :                 if (tsflags & SOF_TIMESTAMPING_TX_ACK)
     478           0 :                         tcb->txstamp_ack = 1;
     479           0 :                 if (tsflags & SOF_TIMESTAMPING_TX_RECORD_MASK)
     480           0 :                         shinfo->tskey = TCP_SKB_CB(skb)->seq + skb->len - 1;
     481             :         }
     482         411 : }
     483             : 
     484         685 : static bool tcp_stream_is_readable(struct sock *sk, int target)
     485             : {
     486         685 :         if (tcp_epollin_ready(sk, target))
     487             :                 return true;
     488             : 
     489         612 :         if (sk->sk_prot->stream_memory_read)
     490           0 :                 return sk->sk_prot->stream_memory_read(sk);
     491             :         return false;
     492             : }
     493             : 
     494             : /*
     495             :  *      Wait for a TCP event.
     496             :  *
     497             :  *      Note that we don't need to lock the socket, as the upper poll layers
     498             :  *      take care of normal races (between the test and the event) and we don't
     499             :  *      go look at any of the socket buffers directly.
     500             :  */
     501         714 : __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
     502             : {
     503         714 :         __poll_t mask;
     504         714 :         struct sock *sk = sock->sk;
     505         714 :         const struct tcp_sock *tp = tcp_sk(sk);
     506         714 :         int state;
     507             : 
     508         714 :         sock_poll_wait(file, sock, wait);
     509             : 
     510         714 :         state = inet_sk_state_load(sk);
     511         714 :         if (state == TCP_LISTEN)
     512          54 :                 return inet_csk_listen_poll(sk);
     513             : 
     514             :         /* Socket is not locked. We are protected from async events
     515             :          * by poll logic and correct handling of state changes
     516             :          * made by other threads is impossible in any case.
     517             :          */
     518             : 
     519         685 :         mask = 0;
     520             : 
     521             :         /*
     522             :          * EPOLLHUP is certainly not done right. But poll() doesn't
     523             :          * have a notion of HUP in just one direction, and for a
     524             :          * socket the read side is more interesting.
     525             :          *
     526             :          * Some poll() documentation says that EPOLLHUP is incompatible
     527             :          * with the EPOLLOUT/POLLWR flags, so somebody should check this
     528             :          * all. But careful, it tends to be safer to return too many
     529             :          * bits than too few, and you can easily break real applications
     530             :          * if you don't tell them that something has hung up!
     531             :          *
     532             :          * Check-me.
     533             :          *
     534             :          * Check number 1. EPOLLHUP is _UNMASKABLE_ event (see UNIX98 and
     535             :          * our fs/select.c). It means that after we received EOF,
      536             :  * poll always returns immediately, making it impossible to poll() for write()
     537             :          * in state CLOSE_WAIT. One solution is evident --- to set EPOLLHUP
     538             :          * if and only if shutdown has been made in both directions.
     539             :          * Actually, it is interesting to look how Solaris and DUX
     540             :          * solve this dilemma. I would prefer, if EPOLLHUP were maskable,
     541             :          * then we could set it on SND_SHUTDOWN. BTW examples given
     542             :          * in Stevens' books assume exactly this behaviour, it explains
     543             :          * why EPOLLHUP is incompatible with EPOLLOUT.  --ANK
     544             :          *
     545             :          * NOTE. Check for TCP_CLOSE is added. The goal is to prevent
     546             :          * blocking on fresh not-connected or disconnected socket. --ANK
     547             :          */
     548         685 :         if (sk->sk_shutdown == SHUTDOWN_MASK || state == TCP_CLOSE)
     549           0 :                 mask |= EPOLLHUP;
     550         685 :         if (sk->sk_shutdown & RCV_SHUTDOWN)
     551           1 :                 mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
     552             : 
     553             :         /* Connected or passive Fast Open socket? */
     554         685 :         if (state != TCP_SYN_SENT &&
     555           0 :             (state != TCP_SYN_RECV || rcu_access_pointer(tp->fastopen_rsk))) {
     556         685 :                 int target = sock_rcvlowat(sk, 0, INT_MAX);
     557             : 
     558         685 :                 if (READ_ONCE(tp->urg_seq) == READ_ONCE(tp->copied_seq) &&
     559           0 :                     !sock_flag(sk, SOCK_URGINLINE) &&
     560           0 :                     tp->urg_data)
     561           0 :                         target++;
     562             : 
     563         685 :                 if (tcp_stream_is_readable(sk, target))
     564          73 :                         mask |= EPOLLIN | EPOLLRDNORM;
     565             : 
     566         685 :                 if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
     567         685 :                         if (__sk_stream_is_writeable(sk, 1)) {
     568         685 :                                 mask |= EPOLLOUT | EPOLLWRNORM;
     569             :                         } else {  /* send SIGIO later */
     570           0 :                                 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
     571           0 :                                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
     572             : 
     573             :                                 /* Race breaker. If space is freed after
     574             :                                  * wspace test but before the flags are set,
     575             :                                  * IO signal will be lost. Memory barrier
     576             :                                  * pairs with the input side.
     577             :                                  */
     578           0 :                                 smp_mb__after_atomic();
     579           0 :                                 if (__sk_stream_is_writeable(sk, 1))
     580           0 :                                         mask |= EPOLLOUT | EPOLLWRNORM;
     581             :                         }
     582             :                 } else
     583           0 :                         mask |= EPOLLOUT | EPOLLWRNORM;
     584             : 
     585         685 :                 if (tp->urg_data & TCP_URG_VALID)
     586           0 :                         mask |= EPOLLPRI;
     587           0 :         } else if (state == TCP_SYN_SENT && inet_sk(sk)->defer_connect) {
     588             :                 /* Active TCP fastopen socket with defer_connect
     589             :                  * Return EPOLLOUT so application can call write()
     590             :                  * in order for kernel to generate SYN+data
     591             :                  */
     592           0 :                 mask |= EPOLLOUT | EPOLLWRNORM;
     593             :         }
     594             :         /* This barrier is coupled with smp_wmb() in tcp_reset() */
     595         685 :         smp_rmb();
     596         685 :         if (sk->sk_err || !skb_queue_empty_lockless(&sk->sk_error_queue))
     597           0 :                 mask |= EPOLLERR;
     598             : 
     599             :         return mask;
     600             : }
     601             : EXPORT_SYMBOL(tcp_poll);
     602             : 
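From userspace, the mask computed by tcp_poll() is what poll(2) and epoll report.
A minimal sketch (standard APIs only; the helper name is ours): POLLRDHUP
corresponds to the EPOLLRDHUP set above when the peer has shut down its write
side, and SO_RCVLOWAT adjusts the target that sock_rcvlowat() feeds into
tcp_stream_is_readable().

    #define _GNU_SOURCE   /* for POLLRDHUP */
    #include <poll.h>
    #include <sys/socket.h>

    static short wait_readable(int fd, int lowat_bytes, int timeout_ms)
    {
            struct pollfd pfd = { .fd = fd, .events = POLLIN | POLLRDHUP };

            /* Don't report POLLIN until this many bytes are queued. */
            setsockopt(fd, SOL_SOCKET, SO_RCVLOWAT,
                       &lowat_bytes, sizeof(lowat_bytes));

            if (poll(&pfd, 1, timeout_ms) <= 0)
                    return 0;  /* timeout or error */
            return pfd.revents;  /* POLLIN, POLLRDHUP, POLLHUP, POLLERR... */
    }
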
     603           0 : int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
     604             : {
     605           0 :         struct tcp_sock *tp = tcp_sk(sk);
     606           0 :         int answ;
     607           0 :         bool slow;
     608             : 
     609           0 :         switch (cmd) {
     610           0 :         case SIOCINQ:
     611           0 :                 if (sk->sk_state == TCP_LISTEN)
     612             :                         return -EINVAL;
     613             : 
     614           0 :                 slow = lock_sock_fast(sk);
     615           0 :                 answ = tcp_inq(sk);
     616           0 :                 unlock_sock_fast(sk, slow);
     617           0 :                 break;
     618           0 :         case SIOCATMARK:
     619           0 :                 answ = tp->urg_data &&
     620           0 :                        READ_ONCE(tp->urg_seq) == READ_ONCE(tp->copied_seq);
     621             :                 break;
     622           0 :         case SIOCOUTQ:
     623           0 :                 if (sk->sk_state == TCP_LISTEN)
     624             :                         return -EINVAL;
     625             : 
     626           0 :                 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
     627             :                         answ = 0;
     628             :                 else
     629           0 :                         answ = READ_ONCE(tp->write_seq) - tp->snd_una;
     630             :                 break;
     631           0 :         case SIOCOUTQNSD:
     632           0 :                 if (sk->sk_state == TCP_LISTEN)
     633             :                         return -EINVAL;
     634             : 
     635           0 :                 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
     636             :                         answ = 0;
     637             :                 else
     638           0 :                         answ = READ_ONCE(tp->write_seq) -
     639           0 :                                READ_ONCE(tp->snd_nxt);
     640             :                 break;
     641             :         default:
     642             :                 return -ENOIOCTLCMD;
     643             :         }
     644             : 
     645           0 :         return put_user(answ, (int __user *)arg);
     646             : }
     647             : EXPORT_SYMBOL(tcp_ioctl);
     648             : 
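The queue-length ioctls handled above are documented in tcp(7) and usable directly
from userspace; a minimal sketch (the helper name is ours):

    #include <sys/ioctl.h>
    #include <linux/sockios.h>  /* SIOCINQ, SIOCOUTQ */

    /* inq: unread bytes in the receive queue (tcp_inq() above);
     * outq: bytes written but not yet acknowledged by the peer.
     */
    static int tcp_queued_bytes(int fd, int *inq, int *outq)
    {
            if (ioctl(fd, SIOCINQ, inq) < 0)
                    return -1;
            return ioctl(fd, SIOCOUTQ, outq);
    }
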
     649         411 : static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
     650             : {
     651         411 :         TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
     652         411 :         tp->pushed_seq = tp->write_seq;
     653         411 : }
     654             : 
     655           0 : static inline bool forced_push(const struct tcp_sock *tp)
     656             : {
     657           0 :         return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
     658             : }
     659             : 
     660         361 : static void skb_entail(struct sock *sk, struct sk_buff *skb)
     661             : {
     662         361 :         struct tcp_sock *tp = tcp_sk(sk);
     663         361 :         struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
     664             : 
     665         361 :         skb->csum    = 0;
     666         361 :         tcb->seq     = tcb->end_seq = tp->write_seq;
     667         361 :         tcb->tcp_flags = TCPHDR_ACK;
     668         361 :         tcb->sacked  = 0;
     669         361 :         __skb_header_release(skb);
     670         361 :         tcp_add_write_queue_tail(sk, skb);
     671         361 :         sk_wmem_queued_add(sk, skb->truesize);
     672         361 :         sk_mem_charge(sk, skb->truesize);
     673         361 :         if (tp->nonagle & TCP_NAGLE_PUSH)
     674           4 :                 tp->nonagle &= ~TCP_NAGLE_PUSH;
     675             : 
     676         361 :         tcp_slow_start_after_idle_check(sk);
     677         361 : }
     678             : 
     679         411 : static inline void tcp_mark_urg(struct tcp_sock *tp, int flags)
     680             : {
     681         411 :         if (flags & MSG_OOB)
     682           0 :                 tp->snd_up = tp->write_seq;
     683             : }
     684             : 
      685             : /* If a not-yet-filled skb is pushed, do not send it if
      686             :  * we have data packets in Qdisc or NIC queues:
     687             :  * Because TX completion will happen shortly, it gives a chance
     688             :  * to coalesce future sendmsg() payload into this skb, without
     689             :  * need for a timer, and with no latency trade off.
     690             :  * As packets containing data payload have a bigger truesize
      691             :  * than pure ACK (dataless) packets, the last checks prevent
     692             :  * autocorking if we only have an ACK in Qdisc/NIC queues,
     693             :  * or if TX completion was delayed after we processed ACK packet.
     694             :  */
     695         411 : static bool tcp_should_autocork(struct sock *sk, struct sk_buff *skb,
     696             :                                 int size_goal)
     697             : {
     698         822 :         return skb->len < size_goal &&
     699         411 :                sock_net(sk)->ipv4.sysctl_tcp_autocorking &&
     700         822 :                !tcp_rtx_queue_empty(sk) &&
     701         194 :                refcount_read(&sk->sk_wmem_alloc) > skb->truesize;
     702             : }
     703             : 
     704         411 : void tcp_push(struct sock *sk, int flags, int mss_now,
     705             :               int nonagle, int size_goal)
     706             : {
     707         411 :         struct tcp_sock *tp = tcp_sk(sk);
     708         411 :         struct sk_buff *skb;
     709             : 
     710         411 :         skb = tcp_write_queue_tail(sk);
     711         411 :         if (!skb)
     712             :                 return;
     713         411 :         if (!(flags & MSG_MORE) || forced_push(tp))
     714         411 :                 tcp_mark_push(tp, skb);
     715             : 
     716         411 :         tcp_mark_urg(tp, flags);
     717             : 
     718         411 :         if (tcp_should_autocork(sk, skb, size_goal)) {
     719             : 
     720             :                 /* avoid atomic op if TSQ_THROTTLED bit is already set */
     721          74 :                 if (!test_bit(TSQ_THROTTLED, &sk->sk_tsq_flags)) {
     722          58 :                         NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAUTOCORKING);
     723          58 :                         set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags);
     724             :                 }
     725             :                 /* It is possible TX completion already happened
     726             :                  * before we set TSQ_THROTTLED.
     727             :                  */
     728          74 :                 if (refcount_read(&sk->sk_wmem_alloc) > skb->truesize)
     729             :                         return;
     730             :         }
     731             : 
     732         337 :         if (flags & MSG_MORE)
     733           0 :                 nonagle = TCP_NAGLE_CORK;
     734             : 
     735         337 :         __tcp_push_pending_frames(sk, mss_now, nonagle);
     736             : }
     737             : 
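Autocorking (controlled by the net.ipv4.tcp_autocorking sysctl) coexists with
explicit corking from userspace. As a hedged sketch of the latter (standard
socket option; helper name ours), an application batching a multi-part write can
hold back partial frames itself and release them in one burst:

    #include <netinet/in.h>
    #include <netinet/tcp.h>
    #include <sys/socket.h>

    /* While TCP_CORK is set, partial frames are held back (for at
     * most about 200 ms); clearing it pushes any pending data.
     */
    static void tcp_set_cork(int fd, int on)
    {
            setsockopt(fd, IPPROTO_TCP, TCP_CORK, &on, sizeof(on));
    }
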
     738           0 : static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb,
     739             :                                 unsigned int offset, size_t len)
     740             : {
     741           0 :         struct tcp_splice_state *tss = rd_desc->arg.data;
     742           0 :         int ret;
     743             : 
     744           0 :         ret = skb_splice_bits(skb, skb->sk, offset, tss->pipe,
     745           0 :                               min(rd_desc->count, len), tss->flags);
     746           0 :         if (ret > 0)
     747           0 :                 rd_desc->count -= ret;
     748           0 :         return ret;
     749             : }
     750             : 
     751           0 : static int __tcp_splice_read(struct sock *sk, struct tcp_splice_state *tss)
     752             : {
     753             :         /* Store TCP splice context information in read_descriptor_t. */
     754           0 :         read_descriptor_t rd_desc = {
     755             :                 .arg.data = tss,
     756           0 :                 .count    = tss->len,
     757             :         };
     758             : 
     759           0 :         return tcp_read_sock(sk, &rd_desc, tcp_splice_data_recv);
     760             : }
     761             : 
     762             : /**
     763             :  *  tcp_splice_read - splice data from TCP socket to a pipe
     764             :  * @sock:       socket to splice from
     765             :  * @ppos:       position (not valid)
     766             :  * @pipe:       pipe to splice to
     767             :  * @len:        number of bytes to splice
     768             :  * @flags:      splice modifier flags
     769             :  *
     770             :  * Description:
     771             :  *    Will read pages from given socket and fill them into a pipe.
     772             :  *
     773             :  **/
     774           0 : ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
     775             :                         struct pipe_inode_info *pipe, size_t len,
     776             :                         unsigned int flags)
     777             : {
     778           0 :         struct sock *sk = sock->sk;
     779           0 :         struct tcp_splice_state tss = {
     780             :                 .pipe = pipe,
     781             :                 .len = len,
     782             :                 .flags = flags,
     783             :         };
     784           0 :         long timeo;
     785           0 :         ssize_t spliced;
     786           0 :         int ret;
     787             : 
     788           0 :         sock_rps_record_flow(sk);
     789             :         /*
     790             :          * We can't seek on a socket input
     791             :          */
     792           0 :         if (unlikely(*ppos))
     793             :                 return -ESPIPE;
     794             : 
     795           0 :         ret = spliced = 0;
     796             : 
     797           0 :         lock_sock(sk);
     798             : 
     799           0 :         timeo = sock_rcvtimeo(sk, sock->file->f_flags & O_NONBLOCK);
     800           0 :         while (tss.len) {
     801           0 :                 ret = __tcp_splice_read(sk, &tss);
     802           0 :                 if (ret < 0)
     803             :                         break;
     804           0 :                 else if (!ret) {
     805           0 :                         if (spliced)
     806             :                                 break;
     807           0 :                         if (sock_flag(sk, SOCK_DONE))
     808             :                                 break;
     809           0 :                         if (sk->sk_err) {
     810           0 :                                 ret = sock_error(sk);
     811           0 :                                 break;
     812             :                         }
     813           0 :                         if (sk->sk_shutdown & RCV_SHUTDOWN)
     814             :                                 break;
     815           0 :                         if (sk->sk_state == TCP_CLOSE) {
     816             :                                 /*
      817             :                                  * This occurs when the user tries to read
      818             :                                  * from a never-connected socket.
     819             :                                  */
     820             :                                 ret = -ENOTCONN;
     821             :                                 break;
     822             :                         }
     823           0 :                         if (!timeo) {
     824             :                                 ret = -EAGAIN;
     825             :                                 break;
     826             :                         }
     827             :                         /* if __tcp_splice_read() got nothing while we have
     828             :                          * an skb in receive queue, we do not want to loop.
     829             :                          * This might happen with URG data.
     830             :                          */
     831           0 :                         if (!skb_queue_empty(&sk->sk_receive_queue))
     832             :                                 break;
     833           0 :                         sk_wait_data(sk, &timeo, NULL);
     834           0 :                         if (signal_pending(current)) {
     835           0 :                                 ret = sock_intr_errno(timeo);
     836             :                                 break;
     837             :                         }
     838           0 :                         continue;
     839             :                 }
     840           0 :                 tss.len -= ret;
     841           0 :                 spliced += ret;
     842             : 
     843           0 :                 if (!timeo)
     844             :                         break;
     845           0 :                 release_sock(sk);
     846           0 :                 lock_sock(sk);
     847             : 
     848           0 :                 if (sk->sk_err || sk->sk_state == TCP_CLOSE ||
     849           0 :                     (sk->sk_shutdown & RCV_SHUTDOWN) ||
     850           0 :                     signal_pending(current))
     851             :                         break;
     852             :         }
     853             : 
     854           0 :         release_sock(sk);
     855             : 
     856           0 :         if (spliced)
     857             :                 return spliced;
     858             : 
     859           0 :         return ret;
     860             : }
     861             : EXPORT_SYMBOL(tcp_splice_read);
     862             : 
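The userspace counterpart of tcp_splice_read() is splice(2). A minimal sketch
(standard API; helper name ours) that moves data from a TCP socket into a pipe
without a round trip through user memory, e.g. as the first half of a
socket-to-file relay:

    #define _GNU_SOURCE   /* for splice() */
    #include <fcntl.h>
    #include <unistd.h>

    static ssize_t sock_to_pipe(int sockfd, int pipe_wr, size_t len)
    {
            /* Offsets are NULL: both ends are stream-like. */
            return splice(sockfd, NULL, pipe_wr, NULL, len,
                          SPLICE_F_MOVE | SPLICE_F_MORE);
    }
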
     863         361 : struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp,
     864             :                                     bool force_schedule)
     865             : {
     866         361 :         struct sk_buff *skb;
     867             : 
     868         361 :         if (likely(!size)) {
     869         361 :                 skb = sk->sk_tx_skb_cache;
     870         361 :                 if (skb) {
     871           0 :                         skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
     872           0 :                         sk->sk_tx_skb_cache = NULL;
     873           0 :                         pskb_trim(skb, 0);
     874           0 :                         INIT_LIST_HEAD(&skb->tcp_tsorted_anchor);
     875           0 :                         skb_shinfo(skb)->tx_flags = 0;
     876           0 :                         memset(TCP_SKB_CB(skb), 0, sizeof(struct tcp_skb_cb));
     877           0 :                         return skb;
     878             :                 }
     879             :         }
     880             :         /* The TCP header must be at least 32-bit aligned.  */
     881         361 :         size = ALIGN(size, 4);
     882             : 
     883         361 :         if (unlikely(tcp_under_memory_pressure(sk)))
     884           0 :                 sk_mem_reclaim_partial(sk);
     885             : 
     886         361 :         skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp);
     887         361 :         if (likely(skb)) {
     888         361 :                 bool mem_scheduled;
     889             : 
     890         361 :                 if (force_schedule) {
     891         217 :                         mem_scheduled = true;
     892         217 :                         sk_forced_mem_schedule(sk, skb->truesize);
     893             :                 } else {
     894         144 :                         mem_scheduled = sk_wmem_schedule(sk, skb->truesize);
     895             :                 }
     896         361 :                 if (likely(mem_scheduled)) {
     897         361 :                         skb_reserve(skb, sk->sk_prot->max_header);
     898             :                         /*
     899             :                          * Make sure that we have exactly size bytes
     900             :                          * available to the caller, no more, no less.
     901             :                          */
     902         361 :                         skb->reserved_tailroom = skb->end - skb->tail - size;
     903         361 :                         INIT_LIST_HEAD(&skb->tcp_tsorted_anchor);
     904         361 :                         return skb;
     905             :                 }
     906           0 :                 __kfree_skb(skb);
     907             :         } else {
     908           0 :                 sk->sk_prot->enter_memory_pressure(sk);
     909           0 :                 sk_stream_moderate_sndbuf(sk);
     910             :         }
     911             :         return NULL;
     912             : }
     913             : 
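/* Editor's note: a typical call pattern, mirroring tcp_sendmsg_locked()
 * further below. force_schedule is true only when the rtx and write
 * queues are both empty, so the first skb of a flow is charged with
 * sk_forced_mem_schedule() and never fails memory accounting:
 *
 *	first_skb = tcp_rtx_and_write_queues_empty(sk);
 *	skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation, first_skb);
 *	if (!skb)
 *		goto wait_for_space;
 */
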
     914         411 : static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
     915             :                                        int large_allowed)
     916             : {
     917         411 :         struct tcp_sock *tp = tcp_sk(sk);
     918         411 :         u32 new_size_goal, size_goal;
     919             : 
     920         411 :         if (!large_allowed)
     921             :                 return mss_now;
     922             : 
      923             :         /* Note: tcp_tso_autosize() will eventually split this later */
     924         411 :         new_size_goal = sk->sk_gso_max_size - 1 - MAX_TCP_HEADER;
     925         411 :         new_size_goal = tcp_bound_to_half_wnd(tp, new_size_goal);
     926             : 
     927             :         /* We try hard to avoid divides here */
     928         411 :         size_goal = tp->gso_segs * mss_now;
     929         411 :         if (unlikely(new_size_goal < size_goal ||
     930             :                      new_size_goal >= size_goal + mss_now)) {
     931           4 :                 tp->gso_segs = min_t(u16, new_size_goal / mss_now,
     932             :                                      sk->sk_gso_max_segs);
     933           4 :                 size_goal = tp->gso_segs * mss_now;
     934             :         }
     935             : 
     936         411 :         return max(size_goal, mss_now);
     937             : }
     938             : 
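/* Editor's note, a worked example (the values are assumptions for
 * illustration): with mss_now = 1448, sk_gso_max_size = 65536,
 * MAX_TCP_HEADER = 328 and a window large enough that
 * tcp_bound_to_half_wnd() does not clamp:
 *
 *	new_size_goal = 65536 - 1 - 328 = 65207
 *	tp->gso_segs  = min(65207 / 1448, sk_gso_max_segs) = 45
 *	size_goal     = 45 * 1448 = 65160
 *
 * Later calls with an unchanged mss_now find new_size_goal inside
 * [size_goal, size_goal + mss_now) and skip the divide.
 */
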
     939         411 : int tcp_send_mss(struct sock *sk, int *size_goal, int flags)
     940             : {
     941         411 :         int mss_now;
     942             : 
     943         411 :         mss_now = tcp_current_mss(sk);
     944         411 :         *size_goal = tcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB));
     945             : 
     946         411 :         return mss_now;
     947             : }
     948             : 
     949             : /* In some cases, both sendpage() and sendmsg() could have added
      950             :  * an skb to the write queue, but failed to add any payload to it.
      951             :  * We need to remove it to consume less memory, but more
      952             :  * importantly to be able to generate EPOLLOUT for edge-triggered
      953             :  * epoll() users.
     954             :  */
     955           0 : void tcp_remove_empty_skb(struct sock *sk, struct sk_buff *skb)
     956             : {
     957           0 :         if (skb && !skb->len) {
     958           0 :                 tcp_unlink_write_queue(skb, sk);
     959           0 :                 if (tcp_write_queue_empty(sk))
     960           0 :                         tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
     961           0 :                 sk_wmem_free_skb(sk, skb);
     962             :         }
     963           0 : }
     964             : 
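/* Editor's note: the EPOLLOUT concern above is about userspace doing
 * edge-triggered writes, e.g. (illustrative sketch):
 *
 *	#include <sys/epoll.h>
 *
 *	struct epoll_event ev = { .events = EPOLLOUT | EPOLLET };
 *
 *	epoll_ctl(epfd, EPOLL_CTL_ADD, sock_fd, &ev);
 *	// The application only retries a short write once a new EPOLLOUT
 *	// edge arrives; freeing the empty skb releases write queue memory
 *	// so that edge can actually be generated.
 */
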
     965           0 : struct sk_buff *tcp_build_frag(struct sock *sk, int size_goal, int flags,
     966             :                                struct page *page, int offset, size_t *size)
     967             : {
     968           0 :         struct sk_buff *skb = tcp_write_queue_tail(sk);
     969           0 :         struct tcp_sock *tp = tcp_sk(sk);
     970           0 :         bool can_coalesce;
     971           0 :         int copy, i;
     972             : 
     973           0 :         if (!skb || (copy = size_goal - skb->len) <= 0 ||
     974           0 :             !tcp_skb_can_collapse_to(skb)) {
     975           0 : new_segment:
     976           0 :                 if (!sk_stream_memory_free(sk))
     977             :                         return NULL;
     978             : 
     979           0 :                 skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation,
     980           0 :                                           tcp_rtx_and_write_queues_empty(sk));
     981           0 :                 if (!skb)
     982             :                         return NULL;
     983             : 
     984             : #ifdef CONFIG_TLS_DEVICE
     985             :                 skb->decrypted = !!(flags & MSG_SENDPAGE_DECRYPTED);
     986             : #endif
     987           0 :                 skb_entail(sk, skb);
     988           0 :                 copy = size_goal;
     989             :         }
     990             : 
     991           0 :         if (copy > *size)
     992           0 :                 copy = *size;
     993             : 
     994           0 :         i = skb_shinfo(skb)->nr_frags;
     995           0 :         can_coalesce = skb_can_coalesce(skb, i, page, offset);
     996           0 :         if (!can_coalesce && i >= sysctl_max_skb_frags) {
     997           0 :                 tcp_mark_push(tp, skb);
     998           0 :                 goto new_segment;
     999             :         }
    1000           0 :         if (!sk_wmem_schedule(sk, copy))
    1001             :                 return NULL;
    1002             : 
    1003           0 :         if (can_coalesce) {
    1004           0 :                 skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
    1005             :         } else {
    1006           0 :                 get_page(page);
    1007           0 :                 skb_fill_page_desc(skb, i, page, offset, copy);
    1008             :         }
    1009             : 
    1010           0 :         if (!(flags & MSG_NO_SHARED_FRAGS))
    1011           0 :                 skb_shinfo(skb)->flags |= SKBFL_SHARED_FRAG;
    1012             : 
    1013           0 :         skb->len += copy;
    1014           0 :         skb->data_len += copy;
    1015           0 :         skb->truesize += copy;
    1016           0 :         sk_wmem_queued_add(sk, copy);
    1017           0 :         sk_mem_charge(sk, copy);
    1018           0 :         skb->ip_summed = CHECKSUM_PARTIAL;
    1019           0 :         WRITE_ONCE(tp->write_seq, tp->write_seq + copy);
    1020           0 :         TCP_SKB_CB(skb)->end_seq += copy;
    1021           0 :         tcp_skb_pcount_set(skb, 0);
    1022             : 
    1023           0 :         *size = copy;
    1024           0 :         return skb;
    1025             : }
    1026             : 
    1027           0 : ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
    1028             :                          size_t size, int flags)
    1029             : {
    1030           0 :         struct tcp_sock *tp = tcp_sk(sk);
    1031           0 :         int mss_now, size_goal;
    1032           0 :         int err;
    1033           0 :         ssize_t copied;
    1034           0 :         long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
    1035             : 
    1036           0 :         if (IS_ENABLED(CONFIG_DEBUG_VM) &&
    1037           0 :             WARN_ONCE(!sendpage_ok(page),
     1038             :                       "page must not be a Slab page and must have page_count > 0"))
    1039             :                 return -EINVAL;
    1040             : 
     1041             :         /* Wait for a connection to be established. One exception is TCP Fast
     1042             :          * Open (passive side), where data is allowed to be sent before the
     1043             :          * connection is fully established.
    1044             :          */
    1045           0 :         if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
    1046           0 :             !tcp_passive_fastopen(sk)) {
    1047           0 :                 err = sk_stream_wait_connect(sk, &timeo);
    1048           0 :                 if (err != 0)
    1049           0 :                         goto out_err;
    1050             :         }
    1051             : 
    1052           0 :         sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
    1053             : 
    1054           0 :         mss_now = tcp_send_mss(sk, &size_goal, flags);
    1055           0 :         copied = 0;
    1056             : 
    1057           0 :         err = -EPIPE;
    1058           0 :         if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
    1059           0 :                 goto out_err;
    1060             : 
    1061           0 :         while (size > 0) {
    1062           0 :                 struct sk_buff *skb;
    1063           0 :                 size_t copy = size;
    1064             : 
    1065           0 :                 skb = tcp_build_frag(sk, size_goal, flags, page, offset, &copy);
    1066           0 :                 if (!skb)
    1067           0 :                         goto wait_for_space;
    1068             : 
    1069           0 :                 if (!copied)
    1070           0 :                         TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;
    1071             : 
    1072           0 :                 copied += copy;
    1073           0 :                 offset += copy;
    1074           0 :                 size -= copy;
    1075           0 :                 if (!size)
    1076           0 :                         goto out;
    1077             : 
    1078           0 :                 if (skb->len < size_goal || (flags & MSG_OOB))
    1079           0 :                         continue;
    1080             : 
    1081           0 :                 if (forced_push(tp)) {
    1082           0 :                         tcp_mark_push(tp, skb);
    1083           0 :                         __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
    1084           0 :                 } else if (skb == tcp_send_head(sk))
    1085           0 :                         tcp_push_one(sk, mss_now);
    1086           0 :                 continue;
    1087             : 
    1088           0 : wait_for_space:
    1089           0 :                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
    1090           0 :                 tcp_push(sk, flags & ~MSG_MORE, mss_now,
    1091             :                          TCP_NAGLE_PUSH, size_goal);
    1092             : 
    1093           0 :                 err = sk_stream_wait_memory(sk, &timeo);
    1094           0 :                 if (err != 0)
    1095           0 :                         goto do_error;
    1096             : 
    1097           0 :                 mss_now = tcp_send_mss(sk, &size_goal, flags);
    1098             :         }
    1099             : 
    1100           0 : out:
    1101           0 :         if (copied) {
    1102           0 :                 tcp_tx_timestamp(sk, sk->sk_tsflags);
    1103           0 :                 if (!(flags & MSG_SENDPAGE_NOTLAST))
    1104           0 :                         tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
    1105             :         }
    1106             :         return copied;
    1107             : 
    1108           0 : do_error:
    1109           0 :         tcp_remove_empty_skb(sk, tcp_write_queue_tail(sk));
    1110           0 :         if (copied)
    1111           0 :                 goto out;
    1112           0 : out_err:
     1113             :         /* make sure we wake any epoll edge-triggered waiter */
    1114           0 :         if (unlikely(tcp_rtx_and_write_queues_empty(sk) && err == -EAGAIN)) {
    1115           0 :                 sk->sk_write_space(sk);
    1116           0 :                 tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
    1117             :         }
    1118           0 :         return sk_stream_error(sk, flags, err);
    1119             : }
    1120             : EXPORT_SYMBOL_GPL(do_tcp_sendpages);
    1121             : 
    1122           0 : int tcp_sendpage_locked(struct sock *sk, struct page *page, int offset,
    1123             :                         size_t size, int flags)
    1124             : {
    1125           0 :         if (!(sk->sk_route_caps & NETIF_F_SG))
    1126           0 :                 return sock_no_sendpage_locked(sk, page, offset, size, flags);
    1127             : 
    1128           0 :         tcp_rate_check_app_limited(sk);  /* is sending application-limited? */
    1129             : 
    1130           0 :         return do_tcp_sendpages(sk, page, offset, size, flags);
    1131             : }
    1132             : EXPORT_SYMBOL_GPL(tcp_sendpage_locked);
    1133             : 
    1134           0 : int tcp_sendpage(struct sock *sk, struct page *page, int offset,
    1135             :                  size_t size, int flags)
    1136             : {
    1137           0 :         int ret;
    1138             : 
    1139           0 :         lock_sock(sk);
    1140           0 :         ret = tcp_sendpage_locked(sk, page, offset, size, flags);
    1141           0 :         release_sock(sk);
    1142             : 
    1143           0 :         return ret;
    1144             : }
    1145             : EXPORT_SYMBOL(tcp_sendpage);
    1146             : 
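/* Example (editor's sketch): the usual userspace path into tcp_sendpage()
 * is sendfile(2) on a connected TCP socket; 'sock_fd', 'file_fd' and
 * 'file_size' are assumptions:
 *
 *	#include <sys/sendfile.h>
 *
 *	off_t off = 0;
 *	ssize_t sent = sendfile(sock_fd, file_fd, &off, file_size);
 *
 * Kernel callers that already hold the socket lock use
 * tcp_sendpage_locked() directly.
 */
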
    1147           4 : void tcp_free_fastopen_req(struct tcp_sock *tp)
    1148             : {
    1149           4 :         if (tp->fastopen_req) {
    1150           0 :                 kfree(tp->fastopen_req);
    1151           0 :                 tp->fastopen_req = NULL;
    1152             :         }
    1153           4 : }
    1154             : 
    1155           0 : static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg,
    1156             :                                 int *copied, size_t size,
    1157             :                                 struct ubuf_info *uarg)
    1158             : {
    1159           0 :         struct tcp_sock *tp = tcp_sk(sk);
    1160           0 :         struct inet_sock *inet = inet_sk(sk);
    1161           0 :         struct sockaddr *uaddr = msg->msg_name;
    1162           0 :         int err, flags;
    1163             : 
    1164           0 :         if (!(sock_net(sk)->ipv4.sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) ||
    1165           0 :             (uaddr && msg->msg_namelen >= sizeof(uaddr->sa_family) &&
    1166           0 :              uaddr->sa_family == AF_UNSPEC))
    1167             :                 return -EOPNOTSUPP;
    1168           0 :         if (tp->fastopen_req)
    1169             :                 return -EALREADY; /* Another Fast Open is in progress */
    1170             : 
    1171           0 :         tp->fastopen_req = kzalloc(sizeof(struct tcp_fastopen_request),
    1172             :                                    sk->sk_allocation);
    1173           0 :         if (unlikely(!tp->fastopen_req))
    1174             :                 return -ENOBUFS;
    1175           0 :         tp->fastopen_req->data = msg;
    1176           0 :         tp->fastopen_req->size = size;
    1177           0 :         tp->fastopen_req->uarg = uarg;
    1178             : 
    1179           0 :         if (inet->defer_connect) {
    1180           0 :                 err = tcp_connect(sk);
    1181             :                 /* Same failure procedure as in tcp_v4/6_connect */
    1182           0 :                 if (err) {
    1183           0 :                         tcp_set_state(sk, TCP_CLOSE);
    1184           0 :                         inet->inet_dport = 0;
    1185           0 :                         sk->sk_route_caps = 0;
    1186             :                 }
    1187             :         }
    1188           0 :         flags = (msg->msg_flags & MSG_DONTWAIT) ? O_NONBLOCK : 0;
    1189           0 :         err = __inet_stream_connect(sk->sk_socket, uaddr,
    1190             :                                     msg->msg_namelen, flags, 1);
    1191             :         /* fastopen_req could already be freed in __inet_stream_connect
     1192             :          * if the connection times out or is reset.
    1193             :          */
    1194           0 :         if (tp->fastopen_req) {
    1195           0 :                 *copied = tp->fastopen_req->copied;
    1196           0 :                 tcp_free_fastopen_req(tp);
    1197           0 :                 inet->defer_connect = 0;
    1198             :         }
    1199             :         return err;
    1200             : }
    1201             : 
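/* Example (editor's sketch): a client exercises tcp_sendmsg_fastopen()
 * by sending on a not-yet-connected socket with MSG_FASTOPEN, assuming
 * the client bit is enabled in the net.ipv4.tcp_fastopen sysctl; 'buf',
 * 'len' and 'addr' are assumptions:
 *
 *	#include <sys/socket.h>
 *	#include <netinet/in.h>
 *
 *	int fd = socket(AF_INET, SOCK_STREAM, 0);
 *	// 'addr' is assumed to be a filled-in struct sockaddr_in.
 *	sendto(fd, buf, len, MSG_FASTOPEN,
 *	       (struct sockaddr *)&addr, sizeof(addr));
 *
 * Alternatively, enabling the TCP_FASTOPEN_CONNECT socket option before
 * connect() takes the inet->defer_connect path handled above.
 */
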
    1202         411 : int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
    1203             : {
    1204         411 :         struct tcp_sock *tp = tcp_sk(sk);
    1205         411 :         struct ubuf_info *uarg = NULL;
    1206         411 :         struct sk_buff *skb;
    1207         411 :         struct sockcm_cookie sockc;
    1208         411 :         int flags, err, copied = 0;
    1209         411 :         int mss_now = 0, size_goal, copied_syn = 0;
    1210         411 :         int process_backlog = 0;
    1211         411 :         bool zc = false;
    1212         411 :         long timeo;
    1213             : 
    1214         411 :         flags = msg->msg_flags;
    1215             : 
    1216         411 :         if (flags & MSG_ZEROCOPY && size && sock_flag(sk, SOCK_ZEROCOPY)) {
    1217           0 :                 skb = tcp_write_queue_tail(sk);
    1218           0 :                 uarg = msg_zerocopy_realloc(sk, size, skb_zcopy(skb));
    1219           0 :                 if (!uarg) {
    1220           0 :                         err = -ENOBUFS;
    1221           0 :                         goto out_err;
    1222             :                 }
    1223             : 
    1224           0 :                 zc = sk->sk_route_caps & NETIF_F_SG;
    1225           0 :                 if (!zc)
    1226           0 :                         uarg->zerocopy = 0;
    1227             :         }
    1228             : 
    1229         411 :         if (unlikely(flags & MSG_FASTOPEN || inet_sk(sk)->defer_connect) &&
    1230           0 :             !tp->repair) {
    1231           0 :                 err = tcp_sendmsg_fastopen(sk, msg, &copied_syn, size, uarg);
    1232           0 :                 if (err == -EINPROGRESS && copied_syn > 0)
    1233           0 :                         goto out;
    1234           0 :                 else if (err)
    1235           0 :                         goto out_err;
    1236             :         }
    1237             : 
    1238         411 :         timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
    1239             : 
    1240         411 :         tcp_rate_check_app_limited(sk);  /* is sending application-limited? */
    1241             : 
     1242             :         /* Wait for a connection to be established. One exception is TCP Fast
     1243             :          * Open (passive side), where data is allowed to be sent before the
     1244             :          * connection is fully established.
    1245             :          */
    1246         411 :         if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
    1247           0 :             !tcp_passive_fastopen(sk)) {
    1248           0 :                 err = sk_stream_wait_connect(sk, &timeo);
    1249           0 :                 if (err != 0)
    1250           0 :                         goto do_error;
    1251             :         }
    1252             : 
    1253         411 :         if (unlikely(tp->repair)) {
    1254           0 :                 if (tp->repair_queue == TCP_RECV_QUEUE) {
    1255           0 :                         copied = tcp_send_rcvq(sk, msg, size);
    1256           0 :                         goto out_nopush;
    1257             :                 }
    1258             : 
    1259           0 :                 err = -EINVAL;
    1260           0 :                 if (tp->repair_queue == TCP_NO_QUEUE)
    1261           0 :                         goto out_err;
    1262             : 
    1263             :                 /* 'common' sending to sendq */
    1264             :         }
    1265             : 
    1266         411 :         sockcm_init(&sockc, sk);
    1267         411 :         if (msg->msg_controllen) {
    1268           0 :                 err = sock_cmsg_send(sk, msg, &sockc);
    1269           0 :                 if (unlikely(err)) {
    1270           0 :                         err = -EINVAL;
    1271           0 :                         goto out_err;
    1272             :                 }
    1273             :         }
    1274             : 
    1275             :         /* This should be in poll */
    1276         411 :         sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
    1277             : 
    1278             :         /* Ok commence sending. */
    1279         411 :         copied = 0;
    1280             : 
    1281         411 : restart:
    1282         411 :         mss_now = tcp_send_mss(sk, &size_goal, flags);
    1283             : 
    1284         411 :         err = -EPIPE;
    1285         411 :         if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
    1286           0 :                 goto do_error;
    1287             : 
    1288         411 :         while (msg_data_left(msg)) {
    1289         411 :                 int copy = 0;
    1290             : 
    1291         411 :                 skb = tcp_write_queue_tail(sk);
    1292          50 :                 if (skb)
    1293          50 :                         copy = size_goal - skb->len;
    1294             : 
    1295         411 :                 if (copy <= 0 || !tcp_skb_can_collapse_to(skb)) {
    1296         361 :                         bool first_skb;
    1297             : 
    1298         361 : new_segment:
    1299         361 :                         if (!sk_stream_memory_free(sk))
    1300           0 :                                 goto wait_for_space;
    1301             : 
    1302         361 :                         if (unlikely(process_backlog >= 16)) {
    1303           0 :                                 process_backlog = 0;
    1304           0 :                                 if (sk_flush_backlog(sk))
    1305           0 :                                         goto restart;
    1306             :                         }
    1307         361 :                         first_skb = tcp_rtx_and_write_queues_empty(sk);
    1308         361 :                         skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation,
    1309             :                                                   first_skb);
    1310         361 :                         if (!skb)
    1311           0 :                                 goto wait_for_space;
    1312             : 
    1313         361 :                         process_backlog++;
    1314         361 :                         skb->ip_summed = CHECKSUM_PARTIAL;
    1315             : 
    1316         361 :                         skb_entail(sk, skb);
    1317         361 :                         copy = size_goal;
    1318             : 
    1319             :                         /* All packets are restored as if they have
     1320             :                          * already been sent. skb_mstamp_ns is left unset to
     1321             :                          * avoid a wrong RTT estimation.
    1322             :                          */
    1323         361 :                         if (tp->repair)
    1324           0 :                                 TCP_SKB_CB(skb)->sacked |= TCPCB_REPAIRED;
    1325             :                 }
    1326             : 
    1327             :                 /* Try to append data to the end of skb. */
    1328         411 :                 if (copy > msg_data_left(msg))
    1329         411 :                         copy = msg_data_left(msg);
    1330             : 
    1331             :                 /* Where to copy to? */
    1332         411 :                 if (skb_availroom(skb) > 0 && !zc) {
    1333             :                         /* We have some space in skb head. Superb! */
    1334           0 :                         copy = min_t(int, copy, skb_availroom(skb));
    1335           0 :                         err = skb_add_data_nocache(sk, skb, &msg->msg_iter, copy);
    1336           0 :                         if (err)
    1337           0 :                                 goto do_fault;
    1338         411 :                 } else if (!zc) {
    1339         411 :                         bool merge = true;
    1340         411 :                         int i = skb_shinfo(skb)->nr_frags;
    1341         411 :                         struct page_frag *pfrag = sk_page_frag(sk);
    1342             : 
    1343         411 :                         if (!sk_page_frag_refill(sk, pfrag))
    1344           0 :                                 goto wait_for_space;
    1345             : 
    1346         411 :                         if (!skb_can_coalesce(skb, i, pfrag->page,
    1347         411 :                                               pfrag->offset)) {
    1348         361 :                                 if (i >= sysctl_max_skb_frags) {
    1349           0 :                                         tcp_mark_push(tp, skb);
    1350           0 :                                         goto new_segment;
    1351             :                                 }
    1352             :                                 merge = false;
    1353             :                         }
    1354             : 
    1355         411 :                         copy = min_t(int, copy, pfrag->size - pfrag->offset);
    1356             : 
    1357         411 :                         if (!sk_wmem_schedule(sk, copy))
    1358           0 :                                 goto wait_for_space;
    1359             : 
    1360         822 :                         err = skb_copy_to_page_nocache(sk, &msg->msg_iter, skb,
    1361             :                                                        pfrag->page,
    1362         411 :                                                        pfrag->offset,
    1363             :                                                        copy);
    1364         411 :                         if (err)
    1365           0 :                                 goto do_error;
    1366             : 
    1367             :                         /* Update the skb. */
    1368         411 :                         if (merge) {
    1369          50 :                                 skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
    1370             :                         } else {
    1371         361 :                                 skb_fill_page_desc(skb, i, pfrag->page,
    1372         361 :                                                    pfrag->offset, copy);
    1373         361 :                                 page_ref_inc(pfrag->page);
    1374             :                         }
    1375         411 :                         pfrag->offset += copy;
    1376             :                 } else {
    1377           0 :                         err = skb_zerocopy_iter_stream(sk, skb, msg, copy, uarg);
    1378           0 :                         if (err == -EMSGSIZE || err == -EEXIST) {
    1379           0 :                                 tcp_mark_push(tp, skb);
    1380           0 :                                 goto new_segment;
    1381             :                         }
    1382           0 :                         if (err < 0)
    1383           0 :                                 goto do_error;
    1384             :                         copy = err;
    1385             :                 }
    1386             : 
    1387         411 :                 if (!copied)
    1388         411 :                         TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;
    1389             : 
    1390         411 :                 WRITE_ONCE(tp->write_seq, tp->write_seq + copy);
    1391         411 :                 TCP_SKB_CB(skb)->end_seq += copy;
    1392         411 :                 tcp_skb_pcount_set(skb, 0);
    1393             : 
    1394         411 :                 copied += copy;
    1395         411 :                 if (!msg_data_left(msg)) {
    1396         411 :                         if (unlikely(flags & MSG_EOR))
    1397           0 :                                 TCP_SKB_CB(skb)->eor = 1;
    1398         411 :                         goto out;
    1399             :                 }
    1400             : 
    1401           0 :                 if (skb->len < size_goal || (flags & MSG_OOB) || unlikely(tp->repair))
    1402           0 :                         continue;
    1403             : 
    1404           0 :                 if (forced_push(tp)) {
    1405           0 :                         tcp_mark_push(tp, skb);
    1406           0 :                         __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
    1407           0 :                 } else if (skb == tcp_send_head(sk))
    1408           0 :                         tcp_push_one(sk, mss_now);
    1409           0 :                 continue;
    1410             : 
    1411           0 : wait_for_space:
    1412           0 :                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
    1413           0 :                 if (copied)
    1414           0 :                         tcp_push(sk, flags & ~MSG_MORE, mss_now,
    1415             :                                  TCP_NAGLE_PUSH, size_goal);
    1416             : 
    1417           0 :                 err = sk_stream_wait_memory(sk, &timeo);
    1418           0 :                 if (err != 0)
    1419           0 :                         goto do_error;
    1420             : 
    1421           0 :                 mss_now = tcp_send_mss(sk, &size_goal, flags);
    1422             :         }
    1423             : 
    1424           0 : out:
    1425         411 :         if (copied) {
    1426         411 :                 tcp_tx_timestamp(sk, sockc.tsflags);
    1427         411 :                 tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
    1428             :         }
    1429           0 : out_nopush:
    1430         411 :         net_zcopy_put(uarg);
    1431         411 :         return copied + copied_syn;
    1432             : 
    1433           0 : do_error:
    1434           0 :         skb = tcp_write_queue_tail(sk);
    1435           0 : do_fault:
    1436           0 :         tcp_remove_empty_skb(sk, skb);
    1437             : 
    1438           0 :         if (copied + copied_syn)
    1439           0 :                 goto out;
    1440           0 : out_err:
    1441           0 :         net_zcopy_put_abort(uarg, true);
    1442           0 :         err = sk_stream_error(sk, flags, err);
     1443             :         /* make sure we wake any epoll edge-triggered waiter */
    1444           0 :         if (unlikely(tcp_rtx_and_write_queues_empty(sk) && err == -EAGAIN)) {
    1445           0 :                 sk->sk_write_space(sk);
    1446           0 :                 tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
    1447             :         }
    1448             :         return err;
    1449             : }
    1450             : EXPORT_SYMBOL_GPL(tcp_sendmsg_locked);
    1451             : 
    1452         411 : int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
    1453             : {
    1454         411 :         int ret;
    1455             : 
    1456         411 :         lock_sock(sk);
    1457         411 :         ret = tcp_sendmsg_locked(sk, msg, size);
    1458         411 :         release_sock(sk);
    1459             : 
    1460         411 :         return ret;
    1461             : }
    1462             : EXPORT_SYMBOL(tcp_sendmsg);
    1463             : 
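/* Example (editor's sketch): the MSG_ZEROCOPY branch of
 * tcp_sendmsg_locked() is reached from userspace roughly like this
 * (see Documentation/networking/msg_zerocopy.rst):
 *
 *	int one = 1;
 *	struct msghdr msg = { 0 };
 *
 *	setsockopt(fd, SOL_SOCKET, SO_ZEROCOPY, &one, sizeof(one));
 *	send(fd, buf, len, MSG_ZEROCOPY);
 *	// 'buf' must not be reused until the completion notification
 *	// arrives on the socket error queue.
 *	recvmsg(fd, &msg, MSG_ERRQUEUE);
 */
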
    1464             : /*
    1465             :  *      Handle reading urgent data. BSD has very simple semantics for
     1466             :  *      this: no blocking and very strange errors 8)
    1467             :  */
    1468             : 
    1469           0 : static int tcp_recv_urg(struct sock *sk, struct msghdr *msg, int len, int flags)
    1470             : {
    1471           0 :         struct tcp_sock *tp = tcp_sk(sk);
    1472             : 
    1473             :         /* No URG data to read. */
    1474           0 :         if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data ||
    1475             :             tp->urg_data == TCP_URG_READ)
     1476             :                 return -EINVAL; /* Yes, this is right! */
    1477             : 
    1478           0 :         if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE))
    1479             :                 return -ENOTCONN;
    1480             : 
    1481           0 :         if (tp->urg_data & TCP_URG_VALID) {
    1482           0 :                 int err = 0;
    1483           0 :                 char c = tp->urg_data;
    1484             : 
    1485           0 :                 if (!(flags & MSG_PEEK))
    1486           0 :                         tp->urg_data = TCP_URG_READ;
    1487             : 
    1488             :                 /* Read urgent data. */
    1489           0 :                 msg->msg_flags |= MSG_OOB;
    1490             : 
    1491           0 :                 if (len > 0) {
    1492           0 :                         if (!(flags & MSG_TRUNC))
    1493           0 :                                 err = memcpy_to_msg(msg, &c, 1);
    1494           0 :                         len = 1;
    1495             :                 } else
    1496           0 :                         msg->msg_flags |= MSG_TRUNC;
    1497             : 
    1498           0 :                 return err ? -EFAULT : len;
    1499             :         }
    1500             : 
    1501           0 :         if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN))
    1502           0 :                 return 0;
    1503             : 
    1504             :         /* Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
    1505             :          * the available implementations agree in this case:
    1506             :          * this call should never block, independent of the
    1507             :          * blocking state of the socket.
    1508             :          * Mike <pall@rz.uni-karlsruhe.de>
    1509             :          */
    1510             :         return -EAGAIN;
    1511             : }
    1512             : 
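/* Example (editor's sketch): reading the single byte of urgent data
 * handled above from userspace:
 *
 *	char c;
 *	ssize_t n = recv(fd, &c, 1, MSG_OOB);
 *
 * With SO_OOBINLINE set on the socket this fails with EINVAL, per the
 * first check in tcp_recv_urg().
 */
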
    1513           0 : static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len)
    1514             : {
    1515           0 :         struct sk_buff *skb;
    1516           0 :         int copied = 0, err = 0;
    1517             : 
    1518             :         /* XXX -- need to support SO_PEEK_OFF */
    1519             : 
    1520           0 :         skb_rbtree_walk(skb, &sk->tcp_rtx_queue) {
    1521           0 :                 err = skb_copy_datagram_msg(skb, 0, msg, skb->len);
    1522           0 :                 if (err)
    1523           0 :                         return err;
    1524           0 :                 copied += skb->len;
    1525             :         }
    1526             : 
    1527           0 :         skb_queue_walk(&sk->sk_write_queue, skb) {
    1528           0 :                 err = skb_copy_datagram_msg(skb, 0, msg, skb->len);
    1529           0 :                 if (err)
    1530             :                         break;
    1531             : 
    1532           0 :                 copied += skb->len;
    1533             :         }
    1534             : 
    1535           0 :         return err ?: copied;
    1536             : }
    1537             : 
    1538             : /* Clean up the receive buffer for full frames taken by the user,
    1539             :  * then send an ACK if necessary.  COPIED is the number of bytes
     1540             :  * tcp_recvmsg has given to the user so far; it speeds up the
    1541             :  * calculation of whether or not we must ACK for the sake of
    1542             :  * a window update.
    1543             :  */
    1544         201 : void tcp_cleanup_rbuf(struct sock *sk, int copied)
    1545             : {
    1546         201 :         struct tcp_sock *tp = tcp_sk(sk);
    1547         201 :         bool time_to_ack = false;
    1548             : 
    1549         201 :         struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
    1550             : 
    1551         349 :         WARN(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq),
    1552             :              "cleanup rbuf bug: copied %X seq %X rcvnxt %X\n",
    1553             :              tp->copied_seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt);
    1554             : 
    1555         201 :         if (inet_csk_ack_scheduled(sk)) {
    1556           9 :                 const struct inet_connection_sock *icsk = inet_csk(sk);
    1557             : 
    1558           9 :                 if (/* Once-per-two-segments ACK was not sent by tcp_input.c */
    1559           9 :                     tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss ||
    1560             :                     /*
     1561             :                      * If this read emptied the read buffer, we send an ACK
     1562             :                      * when the connection is not bidirectional, the user
     1563             :                      * drained the receive buffer, and there was a small
     1564             :                      * segment in the queue.
    1565             :                      */
    1566           8 :                     (copied > 0 &&
    1567           7 :                      ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED2) ||
    1568           6 :                       ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
    1569           6 :                        !inet_csk_in_pingpong_mode(sk))) &&
    1570           3 :                       !atomic_read(&sk->sk_rmem_alloc)))
    1571             :                         time_to_ack = true;
    1572             :         }
    1573             : 
    1574             :         /* We send an ACK if we can now advertise a non-zero window
    1575             :          * which has been raised "significantly".
    1576             :          *
     1577             :          * Even if the window was raised up to infinity, do not send a
     1578             :          * window-open ACK in states where we will not receive more. It is useless.
    1579             :          */
    1580         201 :         if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
    1581         196 :                 __u32 rcv_window_now = tcp_receive_window(tp);
    1582             : 
    1583             :                 /* Optimize, __tcp_select_window() is not cheap. */
    1584         196 :                 if (2*rcv_window_now <= tp->window_clamp) {
    1585          23 :                         __u32 new_window = __tcp_select_window(sk);
    1586             : 
     1587             :                         /* Send an ACK now if this read freed lots of space
     1588             :                          * in our buffer. We can advertise the new window now
     1589             :                          * if it is not less than the current one.
    1590             :                          * "Lots" means "at least twice" here.
    1591             :                          */
    1592          23 :                         if (new_window && new_window >= 2 * rcv_window_now)
    1593             :                                 time_to_ack = true;
    1594             :                 }
    1595             :         }
    1596         201 :         if (time_to_ack)
    1597           4 :                 tcp_send_ack(sk);
    1598         201 : }
    1599             : 
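/* Editor's note, a worked example of the window check above (numbers
 * are assumptions): with tp->window_clamp = 65535 and
 * rcv_window_now = 16384, 2 * 16384 <= 65535 holds, so
 * __tcp_select_window() is consulted; if it returns 40960, then
 * 40960 >= 2 * 16384 and a window-update ACK is sent.
 */
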
    1600           0 : static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
    1601             : {
    1602           0 :         struct sk_buff *skb;
    1603           0 :         u32 offset;
    1604             : 
    1605           0 :         while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) {
    1606           0 :                 offset = seq - TCP_SKB_CB(skb)->seq;
    1607           0 :                 if (unlikely(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
     1608             :                         pr_err_once("%s: found a SYN, please report!\n", __func__);
    1609           0 :                         offset--;
    1610             :                 }
    1611           0 :                 if (offset < skb->len || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) {
    1612           0 :                         *off = offset;
    1613           0 :                         return skb;
    1614             :                 }
    1615             :                 /* This looks weird, but this can happen if TCP collapsing
     1616             :                  * split a fat GRO packet while we released the socket lock
     1617             :                  * in skb_splice_bits().
    1618             :                  */
    1619           0 :                 sk_eat_skb(sk, skb);
    1620             :         }
    1621             :         return NULL;
    1622             : }
    1623             : 
    1624             : /*
    1625             :  * This routine provides an alternative to tcp_recvmsg() for routines
    1626             :  * that would like to handle copying from skbuffs directly in 'sendfile'
    1627             :  * fashion.
    1628             :  * Note:
    1629             :  *      - It is assumed that the socket was locked by the caller.
    1630             :  *      - The routine does not block.
    1631             :  *      - At present, there is no support for reading OOB data
    1632             :  *        or for 'peeking' the socket using this routine
    1633             :  *        (although both would be easy to implement).
    1634             :  */
    1635           0 : int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
    1636             :                   sk_read_actor_t recv_actor)
    1637             : {
    1638           0 :         struct sk_buff *skb;
    1639           0 :         struct tcp_sock *tp = tcp_sk(sk);
    1640           0 :         u32 seq = tp->copied_seq;
    1641           0 :         u32 offset;
    1642           0 :         int copied = 0;
    1643             : 
    1644           0 :         if (sk->sk_state == TCP_LISTEN)
    1645             :                 return -ENOTCONN;
    1646           0 :         while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
    1647           0 :                 if (offset < skb->len) {
    1648           0 :                         int used;
    1649           0 :                         size_t len;
    1650             : 
    1651           0 :                         len = skb->len - offset;
    1652             :                         /* Stop reading if we hit a patch of urgent data */
    1653           0 :                         if (tp->urg_data) {
    1654           0 :                                 u32 urg_offset = tp->urg_seq - seq;
    1655           0 :                                 if (urg_offset < len)
    1656           0 :                                         len = urg_offset;
    1657           0 :                                 if (!len)
    1658             :                                         break;
    1659             :                         }
    1660           0 :                         used = recv_actor(desc, skb, offset, len);
    1661           0 :                         if (used <= 0) {
    1662           0 :                                 if (!copied)
    1663           0 :                                         copied = used;
    1664             :                                 break;
    1665           0 :                         } else if (used <= len) {
    1666           0 :                                 seq += used;
    1667           0 :                                 copied += used;
    1668           0 :                                 offset += used;
    1669             :                         }
    1670             :                         /* If recv_actor drops the lock (e.g. TCP splice
    1671             :                          * receive) the skb pointer might be invalid when
    1672             :                          * getting here: tcp_collapse might have deleted it
    1673             :                          * while aggregating skbs from the socket queue.
    1674             :                          */
    1675           0 :                         skb = tcp_recv_skb(sk, seq - 1, &offset);
    1676           0 :                         if (!skb)
    1677             :                                 break;
    1678             :                         /* TCP coalescing might have appended data to the skb.
    1679             :                          * Try to splice more frags
    1680             :                          */
    1681           0 :                         if (offset + 1 != skb->len)
    1682           0 :                                 continue;
    1683             :                 }
    1684           0 :                 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) {
    1685           0 :                         sk_eat_skb(sk, skb);
    1686           0 :                         ++seq;
    1687           0 :                         break;
    1688             :                 }
    1689           0 :                 sk_eat_skb(sk, skb);
    1690           0 :                 if (!desc->count)
    1691             :                         break;
    1692           0 :                 WRITE_ONCE(tp->copied_seq, seq);
    1693             :         }
    1694           0 :         WRITE_ONCE(tp->copied_seq, seq);
    1695             : 
    1696           0 :         tcp_rcv_space_adjust(sk);
    1697             : 
     1698             :         /* Clean up the data we have read: this will send ACK frames as needed. */
    1699           0 :         if (copied > 0) {
    1700           0 :                 tcp_recv_skb(sk, seq, &offset);
    1701           0 :                 tcp_cleanup_rbuf(sk, copied);
    1702             :         }
    1703             :         return copied;
    1704             : }
    1705             : EXPORT_SYMBOL(tcp_read_sock);
    1706             : 
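/* Example (editor's sketch): a minimal in-kernel consumer of
 * tcp_read_sock(). The actor name and 'budget' are made up for
 * illustration:
 *
 *	static int my_recv_actor(read_descriptor_t *desc,
 *				 struct sk_buff *skb,
 *				 unsigned int offset, size_t len)
 *	{
 *		// Consume at most desc->count bytes from skb at offset;
 *		// returning <= 0 stops the walk.
 *		size_t used = min_t(size_t, len, desc->count);
 *
 *		desc->count -= used;
 *		return used;
 *	}
 *
 *	read_descriptor_t rd_desc = { .count = budget };
 *
 *	lock_sock(sk);
 *	copied = tcp_read_sock(sk, &rd_desc, my_recv_actor);
 *	release_sock(sk);
 */
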
    1707           0 : int tcp_peek_len(struct socket *sock)
    1708             : {
    1709           0 :         return tcp_inq(sock->sk);
    1710             : }
    1711             : EXPORT_SYMBOL(tcp_peek_len);
    1712             : 
    1713             : /* Make sure sk_rcvbuf is big enough to satisfy SO_RCVLOWAT hint */
    1714           0 : int tcp_set_rcvlowat(struct sock *sk, int val)
    1715             : {
    1716           0 :         int cap;
    1717             : 
    1718           0 :         if (sk->sk_userlocks & SOCK_RCVBUF_LOCK)
    1719           0 :                 cap = sk->sk_rcvbuf >> 1;
    1720             :         else
    1721           0 :                 cap = sock_net(sk)->ipv4.sysctl_tcp_rmem[2] >> 1;
    1722           0 :         val = min(val, cap);
    1723           0 :         WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
    1724             : 
    1725             :         /* Check if we need to signal EPOLLIN right now */
    1726           0 :         tcp_data_ready(sk);
    1727             : 
    1728           0 :         if (sk->sk_userlocks & SOCK_RCVBUF_LOCK)
    1729             :                 return 0;
    1730             : 
    1731           0 :         val <<= 1;
    1732           0 :         if (val > sk->sk_rcvbuf) {
    1733           0 :                 WRITE_ONCE(sk->sk_rcvbuf, val);
    1734           0 :                 tcp_sk(sk)->window_clamp = tcp_win_from_space(sk, val);
    1735             :         }
    1736             :         return 0;
    1737             : }
    1738             : EXPORT_SYMBOL(tcp_set_rcvlowat);
    1739             : 
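/* Example (editor's sketch): raising the receive low-water mark from
 * userspace lands in tcp_set_rcvlowat() via inet_stream_ops:
 *
 *	int lowat = 64 * 1024;
 *
 *	setsockopt(fd, SOL_SOCKET, SO_RCVLOWAT, &lowat, sizeof(lowat));
 *	// poll()/epoll now report the socket readable only once roughly
 *	// 64KB are queued (or on EOF/error); sk_rcvbuf is grown above
 *	// if needed so the hint can actually be satisfied.
 */
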
    1740           0 : static void tcp_update_recv_tstamps(struct sk_buff *skb,
    1741             :                                     struct scm_timestamping_internal *tss)
    1742             : {
    1743           0 :         if (skb->tstamp)
    1744           0 :                 tss->ts[0] = ktime_to_timespec64(skb->tstamp);
    1745             :         else
    1746           0 :                 tss->ts[0] = (struct timespec64) {0};
    1747             : 
    1748           0 :         if (skb_hwtstamps(skb)->hwtstamp)
    1749           0 :                 tss->ts[2] = ktime_to_timespec64(skb_hwtstamps(skb)->hwtstamp);
    1750             :         else
    1751           0 :                 tss->ts[2] = (struct timespec64) {0};
    1752           0 : }
    1753             : 
    1754             : #ifdef CONFIG_MMU
    1755             : static const struct vm_operations_struct tcp_vm_ops = {
    1756             : };
    1757             : 
    1758           0 : int tcp_mmap(struct file *file, struct socket *sock,
    1759             :              struct vm_area_struct *vma)
    1760             : {
    1761           0 :         if (vma->vm_flags & (VM_WRITE | VM_EXEC))
    1762             :                 return -EPERM;
    1763           0 :         vma->vm_flags &= ~(VM_MAYWRITE | VM_MAYEXEC);
    1764             : 
    1765             :         /* Instruct vm_insert_page() to not mmap_read_lock(mm) */
    1766           0 :         vma->vm_flags |= VM_MIXEDMAP;
    1767             : 
    1768           0 :         vma->vm_ops = &tcp_vm_ops;
    1769           0 :         return 0;
    1770             : }
    1771             : EXPORT_SYMBOL(tcp_mmap);
    1772             : 
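/* Example (editor's sketch): tcp_mmap() backs TCP receive zerocopy.
 * Userspace maps a region of the socket and asks the kernel to remap
 * payload pages into it with the TCP_ZEROCOPY_RECEIVE getsockopt()
 * (tools/testing/selftests/net/tcp_mmap.c shows the full protocol);
 * 'chunk' is an assumed page-aligned length:
 *
 *	#include <sys/mman.h>
 *	#include <linux/tcp.h>
 *
 *	struct tcp_zerocopy_receive zc = { 0 };
 *	socklen_t zc_len = sizeof(zc);
 *	void *addr = mmap(NULL, chunk, PROT_READ, MAP_SHARED, fd, 0);
 *
 *	zc.address = (__u64)(unsigned long)addr;
 *	zc.length = chunk;
 *	getsockopt(fd, IPPROTO_TCP, TCP_ZEROCOPY_RECEIVE, &zc, &zc_len);
 *	// On return, zc.length bytes are mapped at 'addr';
 *	// zc.recv_skip_hint bytes (if any) must be read with recvmsg().
 */
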
    1773           0 : static skb_frag_t *skb_advance_to_frag(struct sk_buff *skb, u32 offset_skb,
    1774             :                                        u32 *offset_frag)
    1775             : {
    1776           0 :         skb_frag_t *frag;
    1777             : 
    1778           0 :         offset_skb -= skb_headlen(skb);
    1779           0 :         if ((int)offset_skb < 0 || skb_has_frag_list(skb))
    1780             :                 return NULL;
    1781             : 
    1782           0 :         frag = skb_shinfo(skb)->frags;
    1783           0 :         while (offset_skb) {
    1784           0 :                 if (skb_frag_size(frag) > offset_skb) {
    1785           0 :                         *offset_frag = offset_skb;
    1786           0 :                         return frag;
    1787             :                 }
    1788           0 :                 offset_skb -= skb_frag_size(frag);
    1789           0 :                 ++frag;
    1790             :         }
    1791           0 :         *offset_frag = 0;
    1792           0 :         return frag;
    1793             : }
    1794             : 
    1795           0 : static bool can_map_frag(const skb_frag_t *frag)
    1796             : {
    1797           0 :         return skb_frag_size(frag) == PAGE_SIZE && !skb_frag_off(frag);
    1798             : }
    1799             : 
    1800           0 : static int find_next_mappable_frag(const skb_frag_t *frag,
    1801             :                                    int remaining_in_skb)
    1802             : {
    1803           0 :         int offset = 0;
    1804             : 
    1805           0 :         if (likely(can_map_frag(frag)))
    1806             :                 return 0;
    1807             : 
    1808           0 :         while (offset < remaining_in_skb && !can_map_frag(frag)) {
    1809           0 :                 offset += skb_frag_size(frag);
    1810           0 :                 ++frag;
    1811             :         }
    1812             :         return offset;
    1813             : }
    1814             : 
    1815           0 : static void tcp_zerocopy_set_hint_for_skb(struct sock *sk,
    1816             :                                           struct tcp_zerocopy_receive *zc,
    1817             :                                           struct sk_buff *skb, u32 offset)
    1818             : {
    1819           0 :         u32 frag_offset, partial_frag_remainder = 0;
    1820           0 :         int mappable_offset;
    1821           0 :         skb_frag_t *frag;
    1822             : 
     1823             :         /* Worst case: skip to next skb. Try to improve on this case below. */
    1824           0 :         zc->recv_skip_hint = skb->len - offset;
    1825             : 
    1826             :         /* Find the frag containing this offset (and how far into that frag) */
    1827           0 :         frag = skb_advance_to_frag(skb, offset, &frag_offset);
    1828           0 :         if (!frag)
    1829           0 :                 return;
    1830             : 
    1831           0 :         if (frag_offset) {
    1832           0 :                 struct skb_shared_info *info = skb_shinfo(skb);
    1833             : 
    1834             :                 /* We read part of the last frag, must recvmsg() rest of skb. */
    1835           0 :                 if (frag == &info->frags[info->nr_frags - 1])
    1836             :                         return;
    1837             : 
    1838             :                 /* Else, we must at least read the remainder in this frag. */
    1839           0 :                 partial_frag_remainder = skb_frag_size(frag) - frag_offset;
    1840           0 :                 zc->recv_skip_hint -= partial_frag_remainder;
    1841           0 :                 ++frag;
    1842             :         }
    1843             : 
    1844             :         /* partial_frag_remainder: If part way through a frag, must read rest.
    1845             :          * mappable_offset: Bytes till next mappable frag, *not* counting bytes
    1846             :          * in partial_frag_remainder.
    1847             :          */
    1848           0 :         mappable_offset = find_next_mappable_frag(frag, zc->recv_skip_hint);
    1849           0 :         zc->recv_skip_hint = mappable_offset + partial_frag_remainder;
    1850             : }
    1851             : 
    1852             : static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len,
    1853             :                               int nonblock, int flags,
    1854             :                               struct scm_timestamping_internal *tss,
    1855             :                               int *cmsg_flags);
    1856           0 : static int receive_fallback_to_copy(struct sock *sk,
    1857             :                                     struct tcp_zerocopy_receive *zc, int inq,
    1858             :                                     struct scm_timestamping_internal *tss)
    1859             : {
    1860           0 :         unsigned long copy_address = (unsigned long)zc->copybuf_address;
    1861           0 :         struct msghdr msg = {};
    1862           0 :         struct iovec iov;
    1863           0 :         int err;
    1864             : 
    1865           0 :         zc->length = 0;
    1866           0 :         zc->recv_skip_hint = 0;
    1867             : 
    1868           0 :         if (copy_address != zc->copybuf_address)
    1869             :                 return -EINVAL;
    1870             : 
    1871           0 :         err = import_single_range(READ, (void __user *)copy_address,
    1872             :                                   inq, &iov, &msg.msg_iter);
    1873           0 :         if (err)
    1874             :                 return err;
    1875             : 
    1876           0 :         err = tcp_recvmsg_locked(sk, &msg, inq, /*nonblock=*/1, /*flags=*/0,
    1877           0 :                                  tss, &zc->msg_flags);
    1878           0 :         if (err < 0)
    1879             :                 return err;
    1880             : 
    1881           0 :         zc->copybuf_len = err;
    1882           0 :         if (likely(zc->copybuf_len)) {
    1883           0 :                 struct sk_buff *skb;
    1884           0 :                 u32 offset;
    1885             : 
    1886           0 :                 skb = tcp_recv_skb(sk, tcp_sk(sk)->copied_seq, &offset);
    1887           0 :                 if (skb)
    1888           0 :                         tcp_zerocopy_set_hint_for_skb(sk, zc, skb, offset);
    1889             :         }
    1890             :         return 0;
    1891             : }
    1892             : 
    1893           0 : static int tcp_copy_straggler_data(struct tcp_zerocopy_receive *zc,
    1894             :                                    struct sk_buff *skb, u32 copylen,
    1895             :                                    u32 *offset, u32 *seq)
    1896             : {
    1897           0 :         unsigned long copy_address = (unsigned long)zc->copybuf_address;
    1898           0 :         struct msghdr msg = {};
    1899           0 :         struct iovec iov;
    1900           0 :         int err;
    1901             : 
    1902           0 :         if (copy_address != zc->copybuf_address)
    1903             :                 return -EINVAL;
    1904             : 
    1905           0 :         err = import_single_range(READ, (void __user *)copy_address,
    1906             :                                   copylen, &iov, &msg.msg_iter);
    1907           0 :         if (err)
    1908             :                 return err;
    1909           0 :         err = skb_copy_datagram_msg(skb, *offset, &msg, copylen);
    1910           0 :         if (err)
    1911             :                 return err;
    1912           0 :         zc->recv_skip_hint -= copylen;
    1913           0 :         *offset += copylen;
    1914           0 :         *seq += copylen;
    1915           0 :         return (__s32)copylen;
    1916             : }
    1917             : 
    1918           0 : static int tcp_zc_handle_leftover(struct tcp_zerocopy_receive *zc,
    1919             :                                   struct sock *sk,
    1920             :                                   struct sk_buff *skb,
    1921             :                                   u32 *seq,
    1922             :                                   s32 copybuf_len,
    1923             :                                   struct scm_timestamping_internal *tss)
    1924             : {
    1925           0 :         u32 offset, copylen = min_t(u32, copybuf_len, zc->recv_skip_hint);
    1926             : 
    1927           0 :         if (!copylen)
    1928             :                 return 0;
    1929             :         /* skb is null if inq < PAGE_SIZE. */
    1930           0 :         if (skb) {
    1931           0 :                 offset = *seq - TCP_SKB_CB(skb)->seq;
    1932             :         } else {
    1933           0 :                 skb = tcp_recv_skb(sk, *seq, &offset);
    1934           0 :                 if (TCP_SKB_CB(skb)->has_rxtstamp) {
    1935           0 :                         tcp_update_recv_tstamps(skb, tss);
    1936           0 :                         zc->msg_flags |= TCP_CMSG_TS;
    1937             :                 }
    1938             :         }
    1939             : 
    1940           0 :         zc->copybuf_len = tcp_copy_straggler_data(zc, skb, copylen, &offset,
    1941             :                                                   seq);
    1942           0 :         return zc->copybuf_len < 0 ? 0 : copylen;
    1943             : }
    1944             : 
    1945           0 : static int tcp_zerocopy_vm_insert_batch_error(struct vm_area_struct *vma,
    1946             :                                               struct page **pending_pages,
    1947             :                                               unsigned long pages_remaining,
    1948             :                                               unsigned long *address,
    1949             :                                               u32 *length,
    1950             :                                               u32 *seq,
    1951             :                                               struct tcp_zerocopy_receive *zc,
    1952             :                                               u32 total_bytes_to_map,
    1953             :                                               int err)
    1954             : {
    1955             :         /* At least one page did not map. Try zapping if we skipped earlier. */
    1956           0 :         if (err == -EBUSY &&
    1957           0 :             zc->flags & TCP_RECEIVE_ZEROCOPY_FLAG_TLB_CLEAN_HINT) {
    1958           0 :                 u32 maybe_zap_len;
    1959             : 
    1960           0 :                 maybe_zap_len = total_bytes_to_map -  /* All bytes to map */
    1961           0 :                                 *length + /* Mapped or pending */
    1962             :                                 (pages_remaining * PAGE_SIZE); /* Failed map. */
    1963           0 :                 zap_page_range(vma, *address, maybe_zap_len);
    1964           0 :                 err = 0;
    1965             :         }
    1966             : 
    1967           0 :         if (!err) {
    1968           0 :                 unsigned long leftover_pages = pages_remaining;
    1969           0 :                 int bytes_mapped;
    1970             : 
    1971             :                 /* We called zap_page_range, try to reinsert. */
    1972           0 :                 err = vm_insert_pages(vma, *address,
    1973             :                                       pending_pages,
    1974             :                                       &pages_remaining);
    1975           0 :                 bytes_mapped = PAGE_SIZE * (leftover_pages - pages_remaining);
    1976           0 :                 *seq += bytes_mapped;
    1977           0 :                 *address += bytes_mapped;
    1978             :         }
    1979           0 :         if (err) {
    1980             :                 /* Either we were unable to zap, OR we zapped, retried an
     1981             :                  * insert, and still had an issue. Either way, pages_remaining
    1982             :                  * is the number of pages we were unable to map, and we unroll
    1983             :                  * some state we speculatively touched before.
    1984             :                  */
    1985           0 :                 const int bytes_not_mapped = PAGE_SIZE * pages_remaining;
    1986             : 
    1987           0 :                 *length -= bytes_not_mapped;
    1988           0 :                 zc->recv_skip_hint += bytes_not_mapped;
    1989             :         }
    1990           0 :         return err;
    1991             : }
    1992             : 
    1993           0 : static int tcp_zerocopy_vm_insert_batch(struct vm_area_struct *vma,
    1994             :                                         struct page **pages,
    1995             :                                         unsigned int pages_to_map,
    1996             :                                         unsigned long *address,
    1997             :                                         u32 *length,
    1998             :                                         u32 *seq,
    1999             :                                         struct tcp_zerocopy_receive *zc,
    2000             :                                         u32 total_bytes_to_map)
    2001             : {
    2002           0 :         unsigned long pages_remaining = pages_to_map;
    2003           0 :         unsigned int pages_mapped;
    2004           0 :         unsigned int bytes_mapped;
    2005           0 :         int err;
    2006             : 
    2007           0 :         err = vm_insert_pages(vma, *address, pages, &pages_remaining);
    2008           0 :         pages_mapped = pages_to_map - (unsigned int)pages_remaining;
    2009           0 :         bytes_mapped = PAGE_SIZE * pages_mapped;
    2010             :         /* Even if vm_insert_pages fails, it may have partially succeeded in
    2011             :          * mapping (some but not all of the pages).
    2012             :          */
    2013           0 :         *seq += bytes_mapped;
    2014           0 :         *address += bytes_mapped;
    2015             : 
    2016           0 :         if (likely(!err))
    2017             :                 return 0;
    2018             : 
    2019             :         /* Error: maybe zap and retry + rollback state for failed inserts. */
    2020           0 :         return tcp_zerocopy_vm_insert_batch_error(vma, pages + pages_mapped,
    2021             :                 pages_remaining, address, length, seq, zc, total_bytes_to_map,
    2022             :                 err);
    2023             : }
    2024             : 
    2025             : #define TCP_VALID_ZC_MSG_FLAGS   (TCP_CMSG_TS)
    2026             : static void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk,
    2027             :                                struct scm_timestamping_internal *tss);
    2028           0 : static void tcp_zc_finalize_rx_tstamp(struct sock *sk,
    2029             :                                       struct tcp_zerocopy_receive *zc,
    2030             :                                       struct scm_timestamping_internal *tss)
    2031             : {
    2032           0 :         unsigned long msg_control_addr;
    2033           0 :         struct msghdr cmsg_dummy;
    2034             : 
    2035           0 :         msg_control_addr = (unsigned long)zc->msg_control;
    2036           0 :         cmsg_dummy.msg_control = (void *)msg_control_addr;
    2037           0 :         cmsg_dummy.msg_controllen =
    2038           0 :                 (__kernel_size_t)zc->msg_controllen;
    2039           0 :         cmsg_dummy.msg_flags = in_compat_syscall()
    2040           0 :                 ? MSG_CMSG_COMPAT : 0;
    2041           0 :         zc->msg_flags = 0;
    2042           0 :         if (zc->msg_control == msg_control_addr &&
    2043             :             zc->msg_controllen == cmsg_dummy.msg_controllen) {
    2044           0 :                 tcp_recv_timestamp(&cmsg_dummy, sk, tss);
    2045           0 :                 zc->msg_control = (__u64)
    2046           0 :                         ((uintptr_t)cmsg_dummy.msg_control);
    2047           0 :                 zc->msg_controllen =
    2048           0 :                         (__u64)cmsg_dummy.msg_controllen;
    2049           0 :                 zc->msg_flags = (__u32)cmsg_dummy.msg_flags;
    2050             :         }
    2051           0 : }
    2052             : 
    2053             : #define TCP_ZEROCOPY_PAGE_BATCH_SIZE 32
    2054           0 : static int tcp_zerocopy_receive(struct sock *sk,
    2055             :                                 struct tcp_zerocopy_receive *zc,
    2056             :                                 struct scm_timestamping_internal *tss)
    2057             : {
    2058           0 :         u32 length = 0, offset, vma_len, avail_len, copylen = 0;
    2059           0 :         unsigned long address = (unsigned long)zc->address;
    2060           0 :         struct page *pages[TCP_ZEROCOPY_PAGE_BATCH_SIZE];
    2061           0 :         s32 copybuf_len = zc->copybuf_len;
    2062           0 :         struct tcp_sock *tp = tcp_sk(sk);
    2063           0 :         const skb_frag_t *frags = NULL;
    2064           0 :         unsigned int pages_to_map = 0;
    2065           0 :         struct vm_area_struct *vma;
    2066           0 :         struct sk_buff *skb = NULL;
    2067           0 :         u32 seq = tp->copied_seq;
    2068           0 :         u32 total_bytes_to_map;
    2069           0 :         int inq = tcp_inq(sk);
    2070           0 :         int ret;
    2071             : 
    2072           0 :         zc->copybuf_len = 0;
    2073           0 :         zc->msg_flags = 0;
    2074             : 
    2075           0 :         if (address & (PAGE_SIZE - 1) || address != zc->address)
    2076             :                 return -EINVAL;
    2077             : 
    2078           0 :         if (sk->sk_state == TCP_LISTEN)
    2079             :                 return -ENOTCONN;
    2080             : 
    2081           0 :         sock_rps_record_flow(sk);
    2082             : 
    2083           0 :         if (inq && inq <= copybuf_len)
    2084           0 :                 return receive_fallback_to_copy(sk, zc, inq, tss);
    2085             : 
    2086           0 :         if (inq < PAGE_SIZE) {
    2087           0 :                 zc->length = 0;
    2088           0 :                 zc->recv_skip_hint = inq;
    2089           0 :                 if (!inq && sock_flag(sk, SOCK_DONE))
    2090             :                         return -EIO;
    2091           0 :                 return 0;
    2092             :         }
    2093             : 
    2094           0 :         mmap_read_lock(current->mm);
    2095             : 
    2096           0 :         vma = find_vma(current->mm, address);
    2097           0 :         if (!vma || vma->vm_start > address || vma->vm_ops != &tcp_vm_ops) {
    2098           0 :                 mmap_read_unlock(current->mm);
    2099           0 :                 return -EINVAL;
    2100             :         }
    2101           0 :         vma_len = min_t(unsigned long, zc->length, vma->vm_end - address);
    2102           0 :         avail_len = min_t(u32, vma_len, inq);
    2103           0 :         total_bytes_to_map = avail_len & ~(PAGE_SIZE - 1);
    2104           0 :         if (total_bytes_to_map) {
    2105           0 :                 if (!(zc->flags & TCP_RECEIVE_ZEROCOPY_FLAG_TLB_CLEAN_HINT))
    2106           0 :                         zap_page_range(vma, address, total_bytes_to_map);
    2107           0 :                 zc->length = total_bytes_to_map;
    2108           0 :                 zc->recv_skip_hint = 0;
    2109             :         } else {
    2110           0 :                 zc->length = avail_len;
    2111           0 :                 zc->recv_skip_hint = avail_len;
    2112             :         }
    2113             :         ret = 0;
    2114           0 :         while (length + PAGE_SIZE <= zc->length) {
    2115           0 :                 int mappable_offset;
    2116           0 :                 struct page *page;
    2117             : 
    2118           0 :                 if (zc->recv_skip_hint < PAGE_SIZE) {
    2119           0 :                         u32 offset_frag;
    2120             : 
    2121           0 :                         if (skb) {
    2122           0 :                                 if (zc->recv_skip_hint > 0)
    2123             :                                         break;
    2124           0 :                                 skb = skb->next;
    2125           0 :                                 offset = seq - TCP_SKB_CB(skb)->seq;
    2126             :                         } else {
    2127           0 :                                 skb = tcp_recv_skb(sk, seq, &offset);
    2128             :                         }
    2129             : 
    2130           0 :                         if (TCP_SKB_CB(skb)->has_rxtstamp) {
    2131           0 :                                 tcp_update_recv_tstamps(skb, tss);
    2132           0 :                                 zc->msg_flags |= TCP_CMSG_TS;
    2133             :                         }
    2134           0 :                         zc->recv_skip_hint = skb->len - offset;
    2135           0 :                         frags = skb_advance_to_frag(skb, offset, &offset_frag);
    2136           0 :                         if (!frags || offset_frag)
    2137             :                                 break;
    2138             :                 }
    2139             : 
    2140           0 :                 mappable_offset = find_next_mappable_frag(frags,
    2141           0 :                                                           zc->recv_skip_hint);
    2142           0 :                 if (mappable_offset) {
    2143           0 :                         zc->recv_skip_hint = mappable_offset;
    2144           0 :                         break;
    2145             :                 }
    2146           0 :                 page = skb_frag_page(frags);
    2147           0 :                 prefetchw(page);
    2148           0 :                 pages[pages_to_map++] = page;
    2149           0 :                 length += PAGE_SIZE;
    2150           0 :                 zc->recv_skip_hint -= PAGE_SIZE;
    2151           0 :                 frags++;
    2152           0 :                 if (pages_to_map == TCP_ZEROCOPY_PAGE_BATCH_SIZE ||
    2153             :                     zc->recv_skip_hint < PAGE_SIZE) {
    2154             :                         /* Either full batch, or we're about to go to next skb
    2155             :                          * (and we cannot unroll failed ops across skbs).
    2156             :                          */
    2157           0 :                         ret = tcp_zerocopy_vm_insert_batch(vma, pages,
    2158             :                                                            pages_to_map,
    2159             :                                                            &address, &length,
    2160             :                                                            &seq, zc,
    2161             :                                                            total_bytes_to_map);
    2162           0 :                         if (ret)
    2163           0 :                                 goto out;
    2164             :                         pages_to_map = 0;
    2165             :                 }
    2166             :         }
    2167           0 :         if (pages_to_map) {
    2168           0 :                 ret = tcp_zerocopy_vm_insert_batch(vma, pages, pages_to_map,
    2169             :                                                    &address, &length, &seq,
    2170             :                                                    zc, total_bytes_to_map);
    2171             :         }
    2172           0 : out:
    2173           0 :         mmap_read_unlock(current->mm);
    2174             :         /* Try to copy straggler data. */
    2175           0 :         if (!ret)
    2176           0 :                 copylen = tcp_zc_handle_leftover(zc, sk, skb, &seq, copybuf_len, tss);
    2177             : 
    2178           0 :         if (length + copylen) {
    2179           0 :                 WRITE_ONCE(tp->copied_seq, seq);
    2180           0 :                 tcp_rcv_space_adjust(sk);
    2181             : 
    2182             :                 /* Clean up data we have read: This will do ACK frames. */
    2183           0 :                 tcp_recv_skb(sk, seq, &offset);
    2184           0 :                 tcp_cleanup_rbuf(sk, length + copylen);
    2185           0 :                 ret = 0;
    2186           0 :                 if (length == zc->length)
    2187           0 :                         zc->recv_skip_hint = 0;
    2188             :         } else {
    2189           0 :                 if (!zc->recv_skip_hint && sock_flag(sk, SOCK_DONE))
    2190           0 :                         ret = -EIO;
    2191             :         }
    2192           0 :         zc->length = length;
    2193           0 :         return ret;
    2194             : }
    2195             : #endif
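
The zerocopy receive path above is driven from user space with the
TCP_ZEROCOPY_RECEIVE getsockopt() issued against a region previously
mmap()ed from the TCP socket itself (the vma->vm_ops check above is what
enforces that). A minimal caller sketch, assuming a connected socket fd,
a page-aligned window created with mmap(NULL, ZC_LEN, PROT_READ,
MAP_SHARED, fd, 0), and the struct tcp_zerocopy_receive layout from this
kernel's linux/tcp.h; zc_read() and ZC_LEN are illustrative names and
error handling is abbreviated:

        #include <linux/tcp.h>
        #include <netinet/in.h>
        #include <string.h>
        #include <sys/socket.h>
        #include <sys/types.h>

        #define ZC_LEN (1u << 20)       /* size of the mmap()ed window */

        static ssize_t zc_read(int fd, void *map, char *copybuf, int copybuf_len)
        {
                struct tcp_zerocopy_receive zc;
                socklen_t zc_size = sizeof(zc);

                memset(&zc, 0, sizeof(zc));
                zc.address = (__u64)(unsigned long)map;
                zc.length = ZC_LEN;
                zc.copybuf_address = (__u64)(unsigned long)copybuf;
                zc.copybuf_len = copybuf_len;   /* fallback for small payloads */

                if (getsockopt(fd, IPPROTO_TCP, TCP_ZEROCOPY_RECEIVE,
                               &zc, &zc_size) != 0)
                        return -1;

                /* zc.length payload bytes are now mapped at `map`,
                 * zc.copybuf_len bytes were copied into `copybuf`, and
                 * zc.recv_skip_hint bytes must still be read via recv().
                 */
                return (ssize_t)zc.length + zc.copybuf_len;
        }

When less than a page is queued (or inq <= copybuf_len), the call takes
the receive_fallback_to_copy() path above instead of mapping anything,
and recv_skip_hint tells the caller how much to fetch with ordinary
copies before retrying.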
    2196             : 
    2197             : /* Similar to __sock_recv_timestamp, but does not require an skb */
    2198           0 : static void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk,
    2199             :                                struct scm_timestamping_internal *tss)
    2200             : {
    2201           0 :         int new_tstamp = sock_flag(sk, SOCK_TSTAMP_NEW);
    2202           0 :         bool has_timestamping = false;
    2203             : 
    2204           0 :         if (tss->ts[0].tv_sec || tss->ts[0].tv_nsec) {
    2205           0 :                 if (sock_flag(sk, SOCK_RCVTSTAMP)) {
    2206           0 :                         if (sock_flag(sk, SOCK_RCVTSTAMPNS)) {
    2207           0 :                                 if (new_tstamp) {
    2208           0 :                                         struct __kernel_timespec kts = {
    2209           0 :                                                 .tv_sec = tss->ts[0].tv_sec,
    2210           0 :                                                 .tv_nsec = tss->ts[0].tv_nsec,
    2211             :                                         };
    2212           0 :                                         put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_NEW,
    2213             :                                                  sizeof(kts), &kts);
    2214             :                                 } else {
    2215           0 :                                         struct __kernel_old_timespec ts_old = {
    2216           0 :                                                 .tv_sec = tss->ts[0].tv_sec,
    2217           0 :                                                 .tv_nsec = tss->ts[0].tv_nsec,
    2218             :                                         };
    2219           0 :                                         put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_OLD,
    2220             :                                                  sizeof(ts_old), &ts_old);
    2221             :                                 }
    2222             :                         } else {
    2223           0 :                                 if (new_tstamp) {
    2224           0 :                                         struct __kernel_sock_timeval stv = {
    2225           0 :                                                 .tv_sec = tss->ts[0].tv_sec,
    2226           0 :                                                 .tv_usec = tss->ts[0].tv_nsec / 1000,
    2227             :                                         };
    2228           0 :                                         put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_NEW,
    2229             :                                                  sizeof(stv), &stv);
    2230             :                                 } else {
    2231           0 :                                         struct __kernel_old_timeval tv = {
    2232           0 :                                                 .tv_sec = tss->ts[0].tv_sec,
    2233           0 :                                                 .tv_usec = tss->ts[0].tv_nsec / 1000,
    2234             :                                         };
    2235           0 :                                         put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_OLD,
    2236             :                                                  sizeof(tv), &tv);
    2237             :                                 }
    2238             :                         }
    2239             :                 }
    2240             : 
    2241           0 :                 if (sk->sk_tsflags & SOF_TIMESTAMPING_SOFTWARE)
    2242             :                         has_timestamping = true;
    2243             :                 else
    2244           0 :                         tss->ts[0] = (struct timespec64) {0};
    2245             :         }
    2246             : 
    2247           0 :         if (tss->ts[2].tv_sec || tss->ts[2].tv_nsec) {
    2248           0 :                 if (sk->sk_tsflags & SOF_TIMESTAMPING_RAW_HARDWARE)
    2249             :                         has_timestamping = true;
    2250             :                 else
    2251           0 :                         tss->ts[2] = (struct timespec64) {0};
    2252             :         }
    2253             : 
    2254           0 :         if (has_timestamping) {
    2255           0 :                 tss->ts[1] = (struct timespec64) {0};
    2256           0 :                 if (sock_flag(sk, SOCK_TSTAMP_NEW))
    2257           0 :                         put_cmsg_scm_timestamping64(msg, tss);
    2258             :                 else
    2259           0 :                         put_cmsg_scm_timestamping(msg, tss);
    2260             :         }
    2261           0 : }
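
Which control message tcp_recv_timestamp() emits is selected by socket
flags set from user space. A hedged consumer-side sketch, assuming
software receive timestamps were requested via SO_TIMESTAMPING;
read_with_tstamp() is an illustrative name and error handling is
abbreviated:

        #include <linux/errqueue.h>     /* struct scm_timestamping */
        #include <linux/net_tstamp.h>   /* SOF_TIMESTAMPING_* flags */
        #include <stdio.h>
        #include <string.h>
        #include <sys/socket.h>

        #ifndef SCM_TIMESTAMPING
        #define SCM_TIMESTAMPING SO_TIMESTAMPING
        #endif

        static void read_with_tstamp(int fd)
        {
                int val = SOF_TIMESTAMPING_RX_SOFTWARE | SOF_TIMESTAMPING_SOFTWARE;
                char buf[4096], ctrl[512];
                struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
                struct msghdr msg = {
                        .msg_iov = &iov, .msg_iovlen = 1,
                        .msg_control = ctrl, .msg_controllen = sizeof(ctrl),
                };
                struct cmsghdr *cm;

                setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, &val, sizeof(val));
                if (recvmsg(fd, &msg, 0) <= 0)
                        return;

                for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm)) {
                        if (cm->cmsg_level == SOL_SOCKET &&
                            cm->cmsg_type == SCM_TIMESTAMPING) {
                                struct scm_timestamping tss;

                                memcpy(&tss, CMSG_DATA(cm), sizeof(tss));
                                /* ts[0] = software, ts[2] = raw hardware */
                                printf("sw rx tstamp: %lld.%09ld\n",
                                       (long long)tss.ts[0].tv_sec,
                                       tss.ts[0].tv_nsec);
                        }
                }
        }

The ts[1] slot is always zeroed by the function above; sockets that
enabled the legacy SO_TIMESTAMP/SO_TIMESTAMPNS options instead receive
the corresponding SCM messages from the first branch.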
    2262             : 
    2263           0 : static int tcp_inq_hint(struct sock *sk)
    2264             : {
    2265           0 :         const struct tcp_sock *tp = tcp_sk(sk);
    2266           0 :         u32 copied_seq = READ_ONCE(tp->copied_seq);
    2267           0 :         u32 rcv_nxt = READ_ONCE(tp->rcv_nxt);
    2268           0 :         int inq;
    2269             : 
    2270           0 :         inq = rcv_nxt - copied_seq;
    2271           0 :         if (unlikely(inq < 0 || copied_seq != READ_ONCE(tp->copied_seq))) {
    2272           0 :                 lock_sock(sk);
    2273           0 :                 inq = tp->rcv_nxt - tp->copied_seq;
    2274           0 :                 release_sock(sk);
    2275             :         }
     2276             :         /* After receiving a FIN, tell user space to continue reading
    2277             :          * by returning a non-zero inq.
    2278             :          */
    2279           0 :         if (inq == 0 && sock_flag(sk, SOCK_DONE))
    2280           0 :                 inq = 1;
    2281           0 :         return inq;
    2282             : }
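
On the user side, this hint is requested once with the TCP_INQ socket
option and then arrives as a TCP_CM_INQ control message on every
recvmsg(). A sketch, assuming a linux/tcp.h that defines
TCP_INQ/TCP_CM_INQ (v4.18+); recv_with_inq() is an illustrative name:

        #include <linux/tcp.h>          /* TCP_INQ / TCP_CM_INQ */
        #include <netinet/in.h>
        #include <string.h>
        #include <sys/socket.h>

        static ssize_t recv_with_inq(int fd, char *buf, size_t len, int *inq)
        {
                char ctrl[CMSG_SPACE(sizeof(int))];
                struct iovec iov = { .iov_base = buf, .iov_len = len };
                struct msghdr msg = {
                        .msg_iov = &iov, .msg_iovlen = 1,
                        .msg_control = ctrl, .msg_controllen = sizeof(ctrl),
                };
                struct cmsghdr *cm;
                int one = 1;
                ssize_t ret;

                /* Normally done once at socket setup. */
                setsockopt(fd, IPPROTO_TCP, TCP_INQ, &one, sizeof(one));

                ret = recvmsg(fd, &msg, 0);
                if (ret < 0)
                        return ret;

                for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm))
                        if (cm->cmsg_level == IPPROTO_TCP &&
                            cm->cmsg_type == TCP_CM_INQ)
                                memcpy(inq, CMSG_DATA(cm), sizeof(*inq));
                return ret;
        }

Note the FIN special case above: after the peer closes, the hint comes
back as 1 even though no payload remains, so a caller must not treat a
non-zero hint as a promise of data.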
    2283             : 
    2284             : /*
    2285             :  *      This routine copies from a sock struct into the user buffer.
    2286             :  *
     2287             :  *      Technical note: in 2.3 we work on a _locked_ socket, so that
     2288             :  *      tricks with *seq access order and skb->users are not required.
     2289             :  *      The code can probably be improved even further.
    2290             :  */
    2291             : 
    2292         201 : static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len,
    2293             :                               int nonblock, int flags,
    2294             :                               struct scm_timestamping_internal *tss,
    2295             :                               int *cmsg_flags)
    2296             : {
    2297         201 :         struct tcp_sock *tp = tcp_sk(sk);
    2298         201 :         int copied = 0;
    2299         201 :         u32 peek_seq;
    2300         201 :         u32 *seq;
    2301         201 :         unsigned long used;
    2302         201 :         int err;
    2303         201 :         int target;             /* Read at least this many bytes */
    2304         201 :         long timeo;
    2305         201 :         struct sk_buff *skb, *last;
    2306         201 :         u32 urg_hole = 0;
    2307             : 
    2308         201 :         err = -ENOTCONN;
    2309         201 :         if (sk->sk_state == TCP_LISTEN)
    2310           0 :                 goto out;
    2311             : 
    2312         201 :         if (tp->recvmsg_inq)
    2313           0 :                 *cmsg_flags = TCP_CMSG_INQ;
    2314         201 :         timeo = sock_rcvtimeo(sk, nonblock);
    2315             : 
    2316             :         /* Urgent data needs to be handled specially. */
    2317         201 :         if (flags & MSG_OOB)
    2318           0 :                 goto recv_urg;
    2319             : 
    2320         201 :         if (unlikely(tp->repair)) {
    2321           0 :                 err = -EPERM;
    2322           0 :                 if (!(flags & MSG_PEEK))
    2323           0 :                         goto out;
    2324             : 
    2325           0 :                 if (tp->repair_queue == TCP_SEND_QUEUE)
    2326           0 :                         goto recv_sndq;
    2327             : 
    2328           0 :                 err = -EINVAL;
    2329           0 :                 if (tp->repair_queue == TCP_NO_QUEUE)
    2330           0 :                         goto out;
    2331             : 
    2332             :                 /* 'common' recv queue MSG_PEEK-ing */
    2333             :         }
    2334             : 
    2335         201 :         seq = &tp->copied_seq;
    2336         201 :         if (flags & MSG_PEEK) {
    2337           0 :                 peek_seq = tp->copied_seq;
    2338           0 :                 seq = &peek_seq;
    2339             :         }
    2340             : 
    2341         402 :         target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
    2342             : 
    2343         262 :         do {
    2344         262 :                 u32 offset;
    2345             : 
    2346             :                 /* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
    2347         262 :                 if (tp->urg_data && tp->urg_seq == *seq) {
    2348           0 :                         if (copied)
    2349             :                                 break;
    2350           0 :                         if (signal_pending(current)) {
    2351           0 :                                 copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
    2352             :                                 break;
    2353             :                         }
    2354             :                 }
    2355             : 
    2356             :                 /* Next get a buffer. */
    2357             : 
    2358         262 :                 last = skb_peek_tail(&sk->sk_receive_queue);
    2359         262 :                 skb_queue_walk(&sk->sk_receive_queue, skb) {
    2360         215 :                         last = skb;
    2361             :                         /* Now that we have two receive queues this
    2362             :                          * shouldn't happen.
    2363             :                          */
    2364         215 :                         if (WARN(before(*seq, TCP_SKB_CB(skb)->seq),
    2365             :                                  "TCP recvmsg seq # bug: copied %X, seq %X, rcvnxt %X, fl %X\n",
    2366             :                                  *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt,
    2367             :                                  flags))
    2368             :                                 break;
    2369             : 
    2370         215 :                         offset = *seq - TCP_SKB_CB(skb)->seq;
    2371         215 :                         if (unlikely(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
    2372           0 :                                 pr_err_once("%s: found a SYN, please report !\n", __func__);
    2373           0 :                                 offset--;
    2374             :                         }
    2375         215 :                         if (offset < skb->len)
    2376         215 :                                 goto found_ok_skb;
    2377           0 :                         if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
    2378           0 :                                 goto found_fin_ok;
    2379           0 :                         WARN(!(flags & MSG_PEEK),
    2380             :                              "TCP recvmsg seq # bug 2: copied %X, seq %X, rcvnxt %X, fl %X\n",
    2381             :                              *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt, flags);
    2382             :                 }
    2383             : 
     2384             :                 /* Well, if we have backlog, try to process it now. */
    2385             : 
    2386          47 :                 if (copied >= target && !READ_ONCE(sk->sk_backlog.tail))
    2387             :                         break;
    2388             : 
    2389           3 :                 if (copied) {
    2390           3 :                         if (sk->sk_err ||
    2391           3 :                             sk->sk_state == TCP_CLOSE ||
    2392           3 :                             (sk->sk_shutdown & RCV_SHUTDOWN) ||
    2393           3 :                             !timeo ||
    2394           0 :                             signal_pending(current))
    2395             :                                 break;
    2396             :                 } else {
    2397           0 :                         if (sock_flag(sk, SOCK_DONE))
    2398             :                                 break;
    2399             : 
    2400           0 :                         if (sk->sk_err) {
    2401           0 :                                 copied = sock_error(sk);
    2402           0 :                                 break;
    2403             :                         }
    2404             : 
    2405           0 :                         if (sk->sk_shutdown & RCV_SHUTDOWN)
    2406             :                                 break;
    2407             : 
    2408           0 :                         if (sk->sk_state == TCP_CLOSE) {
     2409             :                                 /* This occurs when the user tries to read
     2410             :                                  * from a never-connected socket.
    2411             :                                  */
    2412             :                                 copied = -ENOTCONN;
    2413             :                                 break;
    2414             :                         }
    2415             : 
    2416           0 :                         if (!timeo) {
    2417             :                                 copied = -EAGAIN;
    2418             :                                 break;
    2419             :                         }
    2420             : 
    2421           0 :                         if (signal_pending(current)) {
    2422           0 :                                 copied = sock_intr_errno(timeo);
    2423             :                                 break;
    2424             :                         }
    2425             :                 }
    2426             : 
    2427           0 :                 tcp_cleanup_rbuf(sk, copied);
    2428             : 
    2429           0 :                 if (copied >= target) {
    2430             :                         /* Do not sleep, just process backlog. */
    2431           0 :                         release_sock(sk);
    2432           0 :                         lock_sock(sk);
    2433             :                 } else {
    2434           0 :                         sk_wait_data(sk, &timeo, last);
    2435             :                 }
    2436             : 
    2437           0 :                 if ((flags & MSG_PEEK) &&
    2438           0 :                     (peek_seq - copied - urg_hole != tp->copied_seq)) {
    2439           0 :                         net_dbg_ratelimited("TCP(%s:%d): Application bug, race in MSG_PEEK\n",
    2440             :                                             current->comm,
    2441             :                                             task_pid_nr(current));
    2442           0 :                         peek_seq = tp->copied_seq;
    2443             :                 }
    2444           0 :                 continue;
    2445             : 
    2446         215 : found_ok_skb:
    2447             :                 /* Ok so how much can we use? */
    2448         215 :                 used = skb->len - offset;
    2449         215 :                 if (len < used)
    2450             :                         used = len;
    2451             : 
    2452             :                 /* Do we have urgent data here? */
    2453         215 :                 if (tp->urg_data) {
    2454           0 :                         u32 urg_offset = tp->urg_seq - *seq;
    2455           0 :                         if (urg_offset < used) {
    2456           0 :                                 if (!urg_offset) {
    2457           0 :                                         if (!sock_flag(sk, SOCK_URGINLINE)) {
    2458           0 :                                                 WRITE_ONCE(*seq, *seq + 1);
    2459           0 :                                                 urg_hole++;
    2460           0 :                                                 offset++;
    2461           0 :                                                 used--;
    2462           0 :                                                 if (!used)
    2463           0 :                                                         goto skip_copy;
    2464             :                                         }
    2465             :                                 } else
    2466             :                                         used = urg_offset;
    2467             :                         }
    2468             :                 }
    2469             : 
    2470         215 :                 if (!(flags & MSG_TRUNC)) {
    2471         215 :                         err = skb_copy_datagram_msg(skb, offset, msg, used);
    2472         215 :                         if (err) {
    2473             :                                 /* Exception. Bailout! */
    2474           0 :                                 if (!copied)
    2475           0 :                                         copied = -EFAULT;
    2476             :                                 break;
    2477             :                         }
    2478             :                 }
    2479             : 
    2480         215 :                 WRITE_ONCE(*seq, *seq + used);
    2481         215 :                 copied += used;
    2482         215 :                 len -= used;
    2483             : 
    2484         215 :                 tcp_rcv_space_adjust(sk);
    2485             : 
    2486         215 : skip_copy:
    2487         215 :                 if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
    2488           0 :                         tp->urg_data = 0;
    2489           0 :                         tcp_fast_path_check(sk);
    2490             :                 }
    2491             : 
    2492         215 :                 if (TCP_SKB_CB(skb)->has_rxtstamp) {
    2493           0 :                         tcp_update_recv_tstamps(skb, tss);
    2494           0 :                         *cmsg_flags |= TCP_CMSG_TS;
    2495             :                 }
    2496             : 
    2497         215 :                 if (used + offset < skb->len)
    2498         148 :                         continue;
    2499             : 
    2500          67 :                 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
    2501           2 :                         goto found_fin_ok;
    2502          65 :                 if (!(flags & MSG_PEEK))
    2503          65 :                         sk_eat_skb(sk, skb);
    2504          65 :                 continue;
    2505             : 
    2506           2 : found_fin_ok:
    2507             :                 /* Process the FIN. */
    2508           2 :                 WRITE_ONCE(*seq, *seq + 1);
    2509           2 :                 if (!(flags & MSG_PEEK))
    2510           2 :                         sk_eat_skb(sk, skb);
    2511             :                 break;
    2512         213 :         } while (len > 0);
    2513             : 
    2514             :         /* According to UNIX98, msg_name/msg_namelen are ignored
     2515             :          * on a connected socket. I was just happy when I found this 8) --ANK
    2516             :          */
    2517             : 
    2518             :         /* Clean up data we have read: This will do ACK frames. */
    2519         201 :         tcp_cleanup_rbuf(sk, copied);
    2520         201 :         return copied;
    2521             : 
    2522             : out:
    2523             :         return err;
    2524             : 
    2525           0 : recv_urg:
    2526           0 :         err = tcp_recv_urg(sk, msg, len, flags);
    2527           0 :         goto out;
    2528             : 
    2529           0 : recv_sndq:
    2530           0 :         err = tcp_peek_sndq(sk, msg, len);
    2531           0 :         goto out;
    2532             : }
    2533             : 
    2534         201 : int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
    2535             :                 int flags, int *addr_len)
    2536             : {
    2537         201 :         int cmsg_flags = 0, ret, inq;
    2538         201 :         struct scm_timestamping_internal tss;
    2539             : 
    2540         201 :         if (unlikely(flags & MSG_ERRQUEUE))
    2541           0 :                 return inet_recv_error(sk, msg, len, addr_len);
    2542             : 
    2543         201 :         if (sk_can_busy_loop(sk) &&
    2544           0 :             skb_queue_empty_lockless(&sk->sk_receive_queue) &&
    2545           0 :             sk->sk_state == TCP_ESTABLISHED)
    2546           0 :                 sk_busy_loop(sk, nonblock);
    2547             : 
    2548         201 :         lock_sock(sk);
    2549         201 :         ret = tcp_recvmsg_locked(sk, msg, len, nonblock, flags, &tss,
    2550             :                                  &cmsg_flags);
    2551         201 :         release_sock(sk);
    2552             : 
    2553         201 :         if (cmsg_flags && ret >= 0) {
    2554           0 :                 if (cmsg_flags & TCP_CMSG_TS)
    2555           0 :                         tcp_recv_timestamp(msg, sk, &tss);
    2556           0 :                 if (cmsg_flags & TCP_CMSG_INQ) {
    2557           0 :                         inq = tcp_inq_hint(sk);
    2558           0 :                         put_cmsg(msg, SOL_TCP, TCP_CM_INQ, sizeof(inq), &inq);
    2559             :                 }
    2560             :         }
    2561             :         return ret;
    2562             : }
    2563             : EXPORT_SYMBOL(tcp_recvmsg);
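
The `target` computed near the top of tcp_recvmsg_locked() is the
user-visible SO_RCVLOWAT/MSG_WAITALL behaviour: a blocking read does not
return until at least that many bytes were copied (or an error, EOF, or
signal intervenes). A sketch, assuming a connected blocking socket;
recv_at_least() is an illustrative name:

        #include <sys/socket.h>
        #include <sys/types.h>

        static ssize_t recv_at_least(int fd, void *buf, size_t len, int lowat)
        {
                /* SO_RCVLOWAT sets the minimum byte count before recv()
                 * returns; passing MSG_WAITALL instead would raise the
                 * kernel's target to the full `len`.
                 */
                if (setsockopt(fd, SOL_SOCKET, SO_RCVLOWAT,
                               &lowat, sizeof(lowat)) < 0)
                        return -1;
                return recv(fd, buf, len, 0);
        }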
    2564             : 
    2565          14 : void tcp_set_state(struct sock *sk, int state)
    2566             : {
    2567          14 :         int oldstate = sk->sk_state;
    2568             : 
    2569             :         /* We defined a new enum for TCP states that are exported in BPF
     2570             :          * so as not to force the internal TCP states to be frozen. The
    2571             :          * following checks will detect if an internal state value ever
    2572             :          * differs from the BPF value. If this ever happens, then we will
    2573             :          * need to remap the internal value to the BPF value before calling
    2574             :          * tcp_call_bpf_2arg.
    2575             :          */
    2576          14 :         BUILD_BUG_ON((int)BPF_TCP_ESTABLISHED != (int)TCP_ESTABLISHED);
    2577          14 :         BUILD_BUG_ON((int)BPF_TCP_SYN_SENT != (int)TCP_SYN_SENT);
    2578          14 :         BUILD_BUG_ON((int)BPF_TCP_SYN_RECV != (int)TCP_SYN_RECV);
    2579          14 :         BUILD_BUG_ON((int)BPF_TCP_FIN_WAIT1 != (int)TCP_FIN_WAIT1);
    2580          14 :         BUILD_BUG_ON((int)BPF_TCP_FIN_WAIT2 != (int)TCP_FIN_WAIT2);
    2581          14 :         BUILD_BUG_ON((int)BPF_TCP_TIME_WAIT != (int)TCP_TIME_WAIT);
    2582          14 :         BUILD_BUG_ON((int)BPF_TCP_CLOSE != (int)TCP_CLOSE);
    2583          14 :         BUILD_BUG_ON((int)BPF_TCP_CLOSE_WAIT != (int)TCP_CLOSE_WAIT);
    2584          14 :         BUILD_BUG_ON((int)BPF_TCP_LAST_ACK != (int)TCP_LAST_ACK);
    2585          14 :         BUILD_BUG_ON((int)BPF_TCP_LISTEN != (int)TCP_LISTEN);
    2586          14 :         BUILD_BUG_ON((int)BPF_TCP_CLOSING != (int)TCP_CLOSING);
    2587          14 :         BUILD_BUG_ON((int)BPF_TCP_NEW_SYN_RECV != (int)TCP_NEW_SYN_RECV);
    2588          14 :         BUILD_BUG_ON((int)BPF_TCP_MAX_STATES != (int)TCP_MAX_STATES);
    2589             : 
    2590          14 :         if (BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), BPF_SOCK_OPS_STATE_CB_FLAG))
    2591           0 :                 tcp_call_bpf_2arg(sk, BPF_SOCK_OPS_STATE_CB, oldstate, state);
    2592             : 
    2593          14 :         switch (state) {
    2594           4 :         case TCP_ESTABLISHED:
    2595           4 :                 if (oldstate != TCP_ESTABLISHED)
    2596           4 :                         TCP_INC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
    2597             :                 break;
    2598             : 
    2599           4 :         case TCP_CLOSE:
    2600           4 :                 if (oldstate == TCP_CLOSE_WAIT || oldstate == TCP_ESTABLISHED)
    2601           4 :                         TCP_INC_STATS(sock_net(sk), TCP_MIB_ESTABRESETS);
    2602             : 
    2603           4 :                 sk->sk_prot->unhash(sk);
    2604           4 :                 if (inet_csk(sk)->icsk_bind_hash &&
    2605           4 :                     !(sk->sk_userlocks & SOCK_BINDPORT_LOCK))
    2606           3 :                         inet_put_port(sk);
    2607          10 :                 fallthrough;
    2608             :         default:
    2609          10 :                 if (oldstate == TCP_ESTABLISHED)
    2610          10 :                         TCP_DEC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
    2611             :         }
    2612             : 
    2613             :         /* Change state AFTER socket is unhashed to avoid closed
    2614             :          * socket sitting in hash tables.
    2615             :          */
    2616          14 :         inet_sk_state_store(sk, state);
    2617          14 : }
    2618             : EXPORT_SYMBOL_GPL(tcp_set_state);
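
The BPF_SOCK_OPS_STATE_CB invocation above only fires for sockets on
which a sockops program has set BPF_SOCK_OPS_STATE_CB_FLAG. A minimal
sketch of such a program, assuming libbpf, clang -target bpf, and
attachment via BPF_CGROUP_SOCK_OPS; watch_tcp_states() is an
illustrative name:

        #include <linux/bpf.h>
        #include <bpf/bpf_helpers.h>

        SEC("sockops")
        int watch_tcp_states(struct bpf_sock_ops *skops)
        {
                switch (skops->op) {
                case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB:
                case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB:
                        /* Opt in to state-change callbacks for this socket. */
                        bpf_sock_ops_cb_flags_set(skops,
                                                  BPF_SOCK_OPS_STATE_CB_FLAG);
                        break;
                case BPF_SOCK_OPS_STATE_CB:
                        /* args[0] = old state, args[1] = new state (BPF_TCP_*) */
                        if (skops->args[1] == BPF_TCP_CLOSE)
                                bpf_printk("tcp socket closed (old state %d)",
                                           skops->args[0]);
                        break;
                }
                return 1;
        }

        char _license[] SEC("license") = "GPL";

The BUILD_BUG_ON() checks above are what allow the raw TCP_* state
values to be passed through unremapped as BPF_TCP_* values.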
    2619             : 
    2620             : /*
    2621             :  *      State processing on a close. This implements the state shift for
    2622             :  *      sending our FIN frame. Note that we only send a FIN for some
    2623             :  *      states. A shutdown() may have already sent the FIN, or we may be
    2624             :  *      closed.
    2625             :  */
    2626             : 
    2627             : static const unsigned char new_state[16] = {
    2628             :   /* current state:        new state:      action:      */
    2629             :   [0 /* (Invalid) */]   = TCP_CLOSE,
    2630             :   [TCP_ESTABLISHED]     = TCP_FIN_WAIT1 | TCP_ACTION_FIN,
    2631             :   [TCP_SYN_SENT]        = TCP_CLOSE,
    2632             :   [TCP_SYN_RECV]        = TCP_FIN_WAIT1 | TCP_ACTION_FIN,
    2633             :   [TCP_FIN_WAIT1]       = TCP_FIN_WAIT1,
    2634             :   [TCP_FIN_WAIT2]       = TCP_FIN_WAIT2,
    2635             :   [TCP_TIME_WAIT]       = TCP_CLOSE,
    2636             :   [TCP_CLOSE]           = TCP_CLOSE,
    2637             :   [TCP_CLOSE_WAIT]      = TCP_LAST_ACK  | TCP_ACTION_FIN,
    2638             :   [TCP_LAST_ACK]        = TCP_LAST_ACK,
    2639             :   [TCP_LISTEN]          = TCP_CLOSE,
    2640             :   [TCP_CLOSING]         = TCP_CLOSING,
    2641             :   [TCP_NEW_SYN_RECV]    = TCP_CLOSE,    /* should not happen ! */
    2642             : };
    2643             : 
    2644           3 : static int tcp_close_state(struct sock *sk)
    2645             : {
    2646           3 :         int next = (int)new_state[sk->sk_state];
    2647           3 :         int ns = next & TCP_STATE_MASK;
    2648             : 
    2649           3 :         tcp_set_state(sk, ns);
    2650             : 
    2651           3 :         return next & TCP_ACTION_FIN;
    2652             : }
    2653             : 
    2654             : /*
    2655             :  *      Shutdown the sending side of a connection. Much like close except
     2656             :  *      that we don't shut down the receive side or sock_set_flag(sk, SOCK_DEAD).
    2657             :  */
    2658             : 
    2659           0 : void tcp_shutdown(struct sock *sk, int how)
    2660             : {
    2661             :         /*      We need to grab some memory, and put together a FIN,
    2662             :          *      and then put it into the queue to be sent.
    2663             :          *              Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
    2664             :          */
    2665           0 :         if (!(how & SEND_SHUTDOWN))
    2666             :                 return;
    2667             : 
    2668             :         /* If we've already sent a FIN, or it's a closed state, skip this. */
    2669           0 :         if ((1 << sk->sk_state) &
    2670             :             (TCPF_ESTABLISHED | TCPF_SYN_SENT |
    2671             :              TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
    2672             :                 /* Clear out any half completed packets.  FIN if needed. */
    2673           0 :                 if (tcp_close_state(sk))
    2674           0 :                         tcp_send_fin(sk);
    2675             :         }
    2676             : }
    2677             : EXPORT_SYMBOL(tcp_shutdown);
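
From user space this is the shutdown(fd, SHUT_WR) half-close: the FIN
goes out via tcp_close_state()/tcp_send_fin() while the receive side
stays open. A common drain pattern (sketch, minimal error handling;
half_close_and_drain() is an illustrative name):

        #include <sys/socket.h>
        #include <unistd.h>

        static void half_close_and_drain(int fd)
        {
                char buf[4096];

                shutdown(fd, SHUT_WR);  /* done sending: queue our FIN */

                /* The peer may keep sending; read until it closes too. */
                while (read(fd, buf, sizeof(buf)) > 0)
                        ;
                close(fd);
        }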
    2678             : 
    2679           3 : bool tcp_check_oom(struct sock *sk, int shift)
    2680             : {
    2681           3 :         bool too_many_orphans, out_of_socket_memory;
    2682             : 
    2683           3 :         too_many_orphans = tcp_too_many_orphans(sk, shift);
    2684           3 :         out_of_socket_memory = tcp_out_of_memory(sk);
    2685             : 
    2686           3 :         if (too_many_orphans)
    2687           0 :                 net_info_ratelimited("too many orphaned sockets\n");
    2688           3 :         if (out_of_socket_memory)
    2689           0 :                 net_info_ratelimited("out of memory -- consider tuning tcp_mem\n");
    2690           3 :         return too_many_orphans || out_of_socket_memory;
    2691             : }
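
The second message above points at the net.ipv4.tcp_mem sysctl (three
values, counted in pages: min, pressure, max); the first relates to
net.ipv4.tcp_max_orphans. A small sketch that reads the current tcp_mem
triple from procfs, purely illustrative:

        #include <stdio.h>

        static int read_tcp_mem(long mem[3])
        {
                FILE *f = fopen("/proc/sys/net/ipv4/tcp_mem", "r");
                int n;

                if (!f)
                        return -1;
                /* min, pressure, max -- counted in pages, not bytes */
                n = fscanf(f, "%ld %ld %ld", &mem[0], &mem[1], &mem[2]);
                fclose(f);
                return n == 3 ? 0 : -1;
        }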
    2692             : 
    2693           4 : void __tcp_close(struct sock *sk, long timeout)
    2694             : {
    2695           4 :         struct sk_buff *skb;
    2696           4 :         int data_was_unread = 0;
    2697           4 :         int state;
    2698             : 
    2699           4 :         sk->sk_shutdown = SHUTDOWN_MASK;
    2700             : 
    2701           4 :         if (sk->sk_state == TCP_LISTEN) {
    2702           1 :                 tcp_set_state(sk, TCP_CLOSE);
    2703             : 
    2704             :                 /* Special case. */
    2705           1 :                 inet_csk_listen_stop(sk);
    2706             : 
    2707           1 :                 goto adjudge_to_death;
    2708             :         }
    2709             : 
     2710             :         /*  We need to flush the receive buffers.  We do this only on the
    2711             :          *  descriptor close, not protocol-sourced closes, because the
    2712             :          *  reader process may not have drained the data yet!
    2713             :          */
    2714           4 :         while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
    2715           1 :                 u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq;
    2716             : 
    2717           1 :                 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
    2718           1 :                         len--;
    2719           1 :                 data_was_unread += len;
    2720           1 :                 __kfree_skb(skb);
    2721             :         }
    2722             : 
    2723           3 :         sk_mem_reclaim(sk);
    2724             : 
    2725             :         /* If socket has been already reset (e.g. in tcp_reset()) - kill it. */
    2726           3 :         if (sk->sk_state == TCP_CLOSE)
    2727           0 :                 goto adjudge_to_death;
    2728             : 
    2729             :         /* As outlined in RFC 2525, section 2.17, we send a RST here because
    2730             :          * data was lost. To witness the awful effects of the old behavior of
    2731             :          * always doing a FIN, run an older 2.1.x kernel or 2.0.x, start a bulk
    2732             :          * GET in an FTP client, suspend the process, wait for the client to
    2733             :          * advertise a zero window, then kill -9 the FTP client, wheee...
    2734             :          * Note: timeout is always zero in such a case.
    2735             :          */
    2736           3 :         if (unlikely(tcp_sk(sk)->repair)) {
    2737           0 :                 sk->sk_prot->disconnect(sk, 0);
    2738           3 :         } else if (data_was_unread) {
    2739             :                 /* Unread data was tossed, zap the connection. */
    2740           0 :                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE);
    2741           0 :                 tcp_set_state(sk, TCP_CLOSE);
    2742           0 :                 tcp_send_active_reset(sk, sk->sk_allocation);
    2743           3 :         } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
    2744             :                 /* Check zero linger _after_ checking for unread data. */
    2745           0 :                 sk->sk_prot->disconnect(sk, 0);
    2746           0 :                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
    2747           3 :         } else if (tcp_close_state(sk)) {
    2748             :                 /* We FIN if the application ate all the data before
    2749             :                  * zapping the connection.
    2750             :                  */
    2751             : 
     2752             :                 /* RED-PEN. Formally speaking, we have broken the TCP
     2753             :                  * state machine. State transitions:
    2754             :                  *
    2755             :                  * TCP_ESTABLISHED -> TCP_FIN_WAIT1
    2756             :                  * TCP_SYN_RECV -> TCP_FIN_WAIT1 (forget it, it's impossible)
    2757             :                  * TCP_CLOSE_WAIT -> TCP_LAST_ACK
    2758             :                  *
    2759             :                  * are legal only when FIN has been sent (i.e. in window),
    2760             :                  * rather than queued out of window. Purists blame.
    2761             :                  *
     2762             :                  * E.g. the "RFC state" is ESTABLISHED
     2763             :                  * if the Linux state is FIN-WAIT-1 but the FIN has not yet been sent.
    2764             :                  *
    2765             :                  * The visible declinations are that sometimes
    2766             :                  * we enter time-wait state, when it is not required really
    2767             :                  * (harmless), do not send active resets, when they are
    2768             :                  * required by specs (TCP_ESTABLISHED, TCP_CLOSE_WAIT, when
    2769             :                  * they look as CLOSING or LAST_ACK for Linux)
    2770             :                  * Probably, I missed some more holelets.
    2771             :                  *                                              --ANK
    2772             :                  * XXX (TFO) - To start off we don't support SYN+ACK+FIN
    2773             :                  * in a single packet! (May consider it later but will
    2774             :                  * probably need API support or TCP_CORK SYN-ACK until
    2775             :                  * data is written and socket is closed.)
    2776             :                  */
    2777           3 :                 tcp_send_fin(sk);
    2778             :         }
    2779             : 
    2780           3 :         sk_stream_wait_close(sk, timeout);
    2781             : 
    2782           4 : adjudge_to_death:
    2783           4 :         state = sk->sk_state;
    2784           4 :         sock_hold(sk);
    2785           4 :         sock_orphan(sk);
    2786             : 
    2787           4 :         local_bh_disable();
    2788           4 :         bh_lock_sock(sk);
    2789             :         /* remove backlog if any, without releasing ownership. */
    2790           4 :         __release_sock(sk);
    2791             : 
    2792           4 :         percpu_counter_inc(sk->sk_prot->orphan_count);
    2793             : 
    2794             :         /* Have we already been destroyed by a softirq or backlog? */
    2795           4 :         if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE)
    2796           0 :                 goto out;
    2797             : 
    2798             :         /*      This is a (useful) BSD-style violation of the RFC. There
    2799             :          *      is a problem with TCP as specified: the other end could
    2800             :          *      keep a socket open forever with no application left at this
    2801             :          *      end. We use a 1 minute timeout (about the same as BSD) and
    2802             :          *      then kill our end. If they send after that then tough - BUT
    2803             :          *      it is long enough that we avoid the old mistake of using
    2804             :          *      4*rto (i.e. almost no time) before resetting.
    2805             :          *
    2806             :          *      No, it was not a mistake. It is really the desired
    2807             :          *      behaviour, e.g. on HTTP servers, where such sockets are
    2808             :          *      useless but consume significant resources. Let's do it with
    2809             :          *      the special linger2 option.                     --ANK
    2810             :          */
    2811             : 
    2812           4 :         if (sk->sk_state == TCP_FIN_WAIT2) {
    2813           0 :                 struct tcp_sock *tp = tcp_sk(sk);
    2814           0 :                 if (tp->linger2 < 0) {
    2815           0 :                         tcp_set_state(sk, TCP_CLOSE);
    2816           0 :                         tcp_send_active_reset(sk, GFP_ATOMIC);
    2817           0 :                         __NET_INC_STATS(sock_net(sk),
    2818             :                                         LINUX_MIB_TCPABORTONLINGER);
    2819             :                 } else {
    2820           0 :                         const int tmo = tcp_fin_time(sk);
    2821             : 
    2822           0 :                         if (tmo > TCP_TIMEWAIT_LEN) {
    2823           0 :                                 inet_csk_reset_keepalive_timer(sk,
    2824           0 :                                                 tmo - TCP_TIMEWAIT_LEN);
    2825             :                         } else {
    2826           0 :                                 tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
    2827           0 :                                 goto out;
    2828             :                         }
    2829             :                 }
    2830             :         }
    2831           4 :         if (sk->sk_state != TCP_CLOSE) {
    2832           3 :                 sk_mem_reclaim(sk);
    2833           3 :                 if (tcp_check_oom(sk, 0)) {
    2834           0 :                         tcp_set_state(sk, TCP_CLOSE);
    2835           0 :                         tcp_send_active_reset(sk, GFP_ATOMIC);
    2836           0 :                         __NET_INC_STATS(sock_net(sk),
    2837             :                                         LINUX_MIB_TCPABORTONMEMORY);
    2838           4 :                 } else if (!check_net(sock_net(sk))) {
    2839             :                         /* Not possible to send reset; just close */
    2840             :                         tcp_set_state(sk, TCP_CLOSE);
    2841             :                 }
    2842             :         }
    2843             : 
    2844           4 :         if (sk->sk_state == TCP_CLOSE) {
    2845           1 :                 struct request_sock *req;
    2846             : 
    2847           1 :                 req = rcu_dereference_protected(tcp_sk(sk)->fastopen_rsk,
    2848             :                                                 lockdep_sock_is_held(sk));
    2849             :                 /* We could get here with a non-NULL req if the socket is
    2850             :                  * aborted (e.g., closed with unread data) before 3WHS
    2851             :                  * finishes.
    2852             :                  */
    2853           1 :                 if (req)
    2854           0 :                         reqsk_fastopen_remove(sk, req, false);
    2855           1 :                 inet_csk_destroy_sock(sk);
    2856             :         }
    2857             :         /* Otherwise, socket is reprieved until protocol close. */
    2858             : 
    2859           3 : out:
    2860           4 :         bh_unlock_sock(sk);
    2861           4 :         local_bh_enable();
    2862           4 : }
    2863             : 
    2864           4 : void tcp_close(struct sock *sk, long timeout)
    2865             : {
    2866           4 :         lock_sock(sk);
    2867           4 :         __tcp_close(sk, timeout);
    2868           4 :         release_sock(sk);
    2869           4 :         sock_put(sk);
    2870           4 : }
    2871             : EXPORT_SYMBOL(tcp_close);
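
/* Usage sketch (not part of tcp.c): one way the zero-linger branch in
 * __tcp_close() above is reached from user space. With SO_LINGER enabled
 * and l_linger == 0, close() disconnects immediately and the peer sees a
 * RST instead of an orderly FIN/TIME-WAIT shutdown. "fd" is assumed to be
 * a connected TCP socket; error handling is minimal.
 */
#include <sys/socket.h>
#include <unistd.h>

static int abortive_close(int fd)
{
        struct linger lin = {
                .l_onoff  = 1,  /* linger enabled...                     */
                .l_linger = 0,  /* ...with a zero timeout: RST on close  */
        };

        if (setsockopt(fd, SOL_SOCKET, SO_LINGER, &lin, sizeof(lin)) < 0)
                return -1;
        return close(fd);
}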
    2872             : 
    2873             : /* These states need RST on ABORT according to RFC793 */
    2874             : 
    2875           0 : static inline bool tcp_need_reset(int state)
    2876             : {
    2877           0 :         return (1 << state) &
    2878             :                (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
    2879             :                 TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
    2880             : }
    2881             : 
    2882           4 : static void tcp_rtx_queue_purge(struct sock *sk)
    2883             : {
    2884           4 :         struct rb_node *p = rb_first(&sk->tcp_rtx_queue);
    2885             : 
    2886           4 :         tcp_sk(sk)->highest_sack = NULL;
    2887           4 :         while (p) {
    2888           0 :                 struct sk_buff *skb = rb_to_skb(p);
    2889             : 
    2890           0 :                 p = rb_next(p);
    2891             :                 /* Since we are deleting the whole queue, there is no need to
    2892             :                  * list_del(&skb->tcp_tsorted_anchor)
    2893             :                  */
    2894           0 :                 tcp_rtx_queue_unlink(skb, sk);
    2895           0 :                 sk_wmem_free_skb(sk, skb);
    2896             :         }
    2897           4 : }
    2898             : 
    2899           4 : void tcp_write_queue_purge(struct sock *sk)
    2900             : {
    2901           4 :         struct sk_buff *skb;
    2902             : 
    2903           4 :         tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
    2904           4 :         while ((skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
    2905           0 :                 tcp_skb_tsorted_anchor_cleanup(skb);
    2906           0 :                 sk_wmem_free_skb(sk, skb);
    2907             :         }
    2908           4 :         tcp_rtx_queue_purge(sk);
    2909           4 :         skb = sk->sk_tx_skb_cache;
    2910           4 :         if (skb) {
    2911           0 :                 __kfree_skb(skb);
    2912           0 :                 sk->sk_tx_skb_cache = NULL;
    2913             :         }
    2914           4 :         INIT_LIST_HEAD(&tcp_sk(sk)->tsorted_sent_queue);
    2915           4 :         sk_mem_reclaim(sk);
    2916           4 :         tcp_clear_all_retrans_hints(tcp_sk(sk));
    2917           4 :         tcp_sk(sk)->packets_out = 0;
    2918           4 :         inet_csk(sk)->icsk_backoff = 0;
    2919           4 : }
    2920             : 
    2921           0 : int tcp_disconnect(struct sock *sk, int flags)
    2922             : {
    2923           0 :         struct inet_sock *inet = inet_sk(sk);
    2924           0 :         struct inet_connection_sock *icsk = inet_csk(sk);
    2925           0 :         struct tcp_sock *tp = tcp_sk(sk);
    2926           0 :         int old_state = sk->sk_state;
    2927           0 :         u32 seq;
    2928             : 
    2929           0 :         if (old_state != TCP_CLOSE)
    2930           0 :                 tcp_set_state(sk, TCP_CLOSE);
    2931             : 
    2932             :         /* ABORT function of RFC793 */
    2933           0 :         if (old_state == TCP_LISTEN) {
    2934           0 :                 inet_csk_listen_stop(sk);
    2935           0 :         } else if (unlikely(tp->repair)) {
    2936           0 :                 sk->sk_err = ECONNABORTED;
    2937           0 :         } else if (tcp_need_reset(old_state) ||
    2938           0 :                    (tp->snd_nxt != tp->write_seq &&
    2939             :                     (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
    2940             :                 /* The last check adjusts for the discrepancy between
    2941             :                  * Linux and RFC states.
    2942             :                  */
    2943           0 :                 tcp_send_active_reset(sk, gfp_any());
    2944           0 :                 sk->sk_err = ECONNRESET;
    2945           0 :         } else if (old_state == TCP_SYN_SENT)
    2946           0 :                 sk->sk_err = ECONNRESET;
    2947             : 
    2948           0 :         tcp_clear_xmit_timers(sk);
    2949           0 :         __skb_queue_purge(&sk->sk_receive_queue);
    2950           0 :         if (sk->sk_rx_skb_cache) {
    2951           0 :                 __kfree_skb(sk->sk_rx_skb_cache);
    2952           0 :                 sk->sk_rx_skb_cache = NULL;
    2953             :         }
    2954           0 :         WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
    2955           0 :         tp->urg_data = 0;
    2956           0 :         tcp_write_queue_purge(sk);
    2957           0 :         tcp_fastopen_active_disable_ofo_check(sk);
    2958           0 :         skb_rbtree_purge(&tp->out_of_order_queue);
    2959             : 
    2960           0 :         inet->inet_dport = 0;
    2961             : 
    2962           0 :         if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
    2963           0 :                 inet_reset_saddr(sk);
    2964             : 
    2965           0 :         sk->sk_shutdown = 0;
    2966           0 :         sock_reset_flag(sk, SOCK_DONE);
    2967           0 :         tp->srtt_us = 0;
    2968           0 :         tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
    2969           0 :         tp->rcv_rtt_last_tsecr = 0;
    2970             : 
    2971           0 :         seq = tp->write_seq + tp->max_window + 2;
    2972           0 :         if (!seq)
    2973             :                 seq = 1;
    2974           0 :         WRITE_ONCE(tp->write_seq, seq);
    2975             : 
    2976           0 :         icsk->icsk_backoff = 0;
    2977           0 :         icsk->icsk_probes_out = 0;
    2978           0 :         icsk->icsk_probes_tstamp = 0;
    2979           0 :         icsk->icsk_rto = TCP_TIMEOUT_INIT;
    2980           0 :         icsk->icsk_rto_min = TCP_RTO_MIN;
    2981           0 :         icsk->icsk_delack_max = TCP_DELACK_MAX;
    2982           0 :         tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
    2983           0 :         tp->snd_cwnd = TCP_INIT_CWND;
    2984           0 :         tp->snd_cwnd_cnt = 0;
    2985           0 :         tp->window_clamp = 0;
    2986           0 :         tp->delivered = 0;
    2987           0 :         tp->delivered_ce = 0;
    2988           0 :         if (icsk->icsk_ca_ops->release)
    2989           0 :                 icsk->icsk_ca_ops->release(sk);
    2990           0 :         memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv));
    2991           0 :         icsk->icsk_ca_initialized = 0;
    2992           0 :         tcp_set_ca_state(sk, TCP_CA_Open);
    2993           0 :         tp->is_sack_reneg = 0;
    2994           0 :         tcp_clear_retrans(tp);
    2995           0 :         tp->total_retrans = 0;
    2996           0 :         inet_csk_delack_init(sk);
    2997             :         /* Initialize rcv_mss to TCP_MIN_MSS to avoid division by 0
    2998             :          * issue in __tcp_select_window()
    2999             :          */
    3000           0 :         icsk->icsk_ack.rcv_mss = TCP_MIN_MSS;
    3001           0 :         memset(&tp->rx_opt, 0, sizeof(tp->rx_opt));
    3002           0 :         __sk_dst_reset(sk);
    3003           0 :         dst_release(sk->sk_rx_dst);
    3004           0 :         sk->sk_rx_dst = NULL;
    3005           0 :         tcp_saved_syn_free(tp);
    3006           0 :         tp->compressed_ack = 0;
    3007           0 :         tp->segs_in = 0;
    3008           0 :         tp->segs_out = 0;
    3009           0 :         tp->bytes_sent = 0;
    3010           0 :         tp->bytes_acked = 0;
    3011           0 :         tp->bytes_received = 0;
    3012           0 :         tp->bytes_retrans = 0;
    3013           0 :         tp->data_segs_in = 0;
    3014           0 :         tp->data_segs_out = 0;
    3015           0 :         tp->duplicate_sack[0].start_seq = 0;
    3016           0 :         tp->duplicate_sack[0].end_seq = 0;
    3017           0 :         tp->dsack_dups = 0;
    3018           0 :         tp->reord_seen = 0;
    3019           0 :         tp->retrans_out = 0;
    3020           0 :         tp->sacked_out = 0;
    3021           0 :         tp->tlp_high_seq = 0;
    3022           0 :         tp->last_oow_ack_time = 0;
    3023             :         /* There's a bubble in the pipe until at least the first ACK. */
    3024           0 :         tp->app_limited = ~0U;
    3025           0 :         tp->rack.mstamp = 0;
    3026           0 :         tp->rack.advanced = 0;
    3027           0 :         tp->rack.reo_wnd_steps = 1;
    3028           0 :         tp->rack.last_delivered = 0;
    3029           0 :         tp->rack.reo_wnd_persist = 0;
    3030           0 :         tp->rack.dsack_seen = 0;
    3031           0 :         tp->syn_data_acked = 0;
    3032           0 :         tp->rx_opt.saw_tstamp = 0;
    3033           0 :         tp->rx_opt.dsack = 0;
    3034           0 :         tp->rx_opt.num_sacks = 0;
    3035           0 :         tp->rcv_ooopack = 0;
    3036             : 
    3037             : 
    3038             :         /* Clean up fastopen related fields */
    3039           0 :         tcp_free_fastopen_req(tp);
    3040           0 :         inet->defer_connect = 0;
    3041           0 :         tp->fastopen_client_fail = 0;
    3042             : 
    3043           0 :         WARN_ON(inet->inet_num && !icsk->icsk_bind_hash);
    3044             : 
    3045           0 :         if (sk->sk_frag.page) {
    3046           0 :                 put_page(sk->sk_frag.page);
    3047           0 :                 sk->sk_frag.page = NULL;
    3048           0 :                 sk->sk_frag.offset = 0;
    3049             :         }
    3050             : 
    3051           0 :         sk->sk_error_report(sk);
    3052           0 :         return 0;
    3053             : }
    3054             : EXPORT_SYMBOL(tcp_disconnect);
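
/* Usage sketch (not part of tcp.c): tcp_disconnect() is what runs when
 * user space connect()s an already-connected TCP socket to an address
 * whose family is AF_UNSPEC (see connect(2)); the association is
 * dissolved and the socket returns to TCP_CLOSE with its state reset as
 * above. Error handling is minimal.
 */
#include <string.h>
#include <sys/socket.h>

static int tcp_userspace_disconnect(int fd)
{
        struct sockaddr sa;

        memset(&sa, 0, sizeof(sa));
        sa.sa_family = AF_UNSPEC;       /* requests the disconnect path */
        return connect(fd, &sa, sizeof(sa));
}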
    3055             : 
    3056           0 : static inline bool tcp_can_repair_sock(const struct sock *sk)
    3057             : {
    3058           0 :         return ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN) &&
    3059           0 :                 (sk->sk_state != TCP_LISTEN);
    3060             : }
    3061             : 
    3062           0 : static int tcp_repair_set_window(struct tcp_sock *tp, sockptr_t optbuf, int len)
    3063             : {
    3064           0 :         struct tcp_repair_window opt;
    3065             : 
    3066           0 :         if (!tp->repair)
    3067             :                 return -EPERM;
    3068             : 
    3069           0 :         if (len != sizeof(opt))
    3070             :                 return -EINVAL;
    3071             : 
    3072           0 :         if (copy_from_sockptr(&opt, optbuf, sizeof(opt)))
    3073             :                 return -EFAULT;
    3074             : 
    3075           0 :         if (opt.max_window < opt.snd_wnd)
    3076             :                 return -EINVAL;
    3077             : 
    3078           0 :         if (after(opt.snd_wl1, tp->rcv_nxt + opt.rcv_wnd))
    3079             :                 return -EINVAL;
    3080             : 
    3081           0 :         if (after(opt.rcv_wup, tp->rcv_nxt))
    3082             :                 return -EINVAL;
    3083             : 
    3084           0 :         tp->snd_wl1  = opt.snd_wl1;
    3085           0 :         tp->snd_wnd  = opt.snd_wnd;
    3086           0 :         tp->max_window       = opt.max_window;
    3087             : 
    3088           0 :         tp->rcv_wnd  = opt.rcv_wnd;
    3089           0 :         tp->rcv_wup  = opt.rcv_wup;
    3090             : 
    3091           0 :         return 0;
    3092             : }
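
/* Usage sketch (not part of tcp.c): checkpoint/restore-style use of the
 * TCP_REPAIR_WINDOW option validated above. The socket must first be put
 * into repair mode (CAP_NET_ADMIN required), or the -EPERM check above
 * fires; "dumped" is assumed to hold values saved from the original
 * connection. Error handling is minimal.
 */
#include <netinet/in.h>
#include <sys/socket.h>
#include <linux/tcp.h>

static int restore_window(int fd, const struct tcp_repair_window *dumped)
{
        int on = TCP_REPAIR_ON;

        if (setsockopt(fd, IPPROTO_TCP, TCP_REPAIR, &on, sizeof(on)) < 0)
                return -1;      /* needs CAP_NET_ADMIN */
        return setsockopt(fd, IPPROTO_TCP, TCP_REPAIR_WINDOW,
                          dumped, sizeof(*dumped));
}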
    3093             : 
    3094           0 : static int tcp_repair_options_est(struct sock *sk, sockptr_t optbuf,
    3095             :                 unsigned int len)
    3096             : {
    3097           0 :         struct tcp_sock *tp = tcp_sk(sk);
    3098           0 :         struct tcp_repair_opt opt;
    3099           0 :         size_t offset = 0;
    3100             : 
    3101           0 :         while (len >= sizeof(opt)) {
    3102           0 :                 if (copy_from_sockptr_offset(&opt, optbuf, offset, sizeof(opt)))
    3103             :                         return -EFAULT;
    3104             : 
    3105           0 :                 offset += sizeof(opt);
    3106           0 :                 len -= sizeof(opt);
    3107             : 
    3108           0 :                 switch (opt.opt_code) {
    3109           0 :                 case TCPOPT_MSS:
    3110           0 :                         tp->rx_opt.mss_clamp = opt.opt_val;
    3111           0 :                         tcp_mtup_init(sk);
    3112           0 :                         break;
    3113           0 :                 case TCPOPT_WINDOW:
    3114             :                         {
    3115           0 :                                 u16 snd_wscale = opt.opt_val & 0xFFFF;
    3116           0 :                                 u16 rcv_wscale = opt.opt_val >> 16;
    3117             : 
    3118           0 :                                 if (snd_wscale > TCP_MAX_WSCALE || rcv_wscale > TCP_MAX_WSCALE)
    3119             :                                         return -EFBIG;
    3120             : 
    3121           0 :                                 tp->rx_opt.snd_wscale = snd_wscale;
    3122           0 :                                 tp->rx_opt.rcv_wscale = rcv_wscale;
    3123           0 :                                 tp->rx_opt.wscale_ok = 1;
    3124             :                         }
    3125           0 :                         break;
    3126           0 :                 case TCPOPT_SACK_PERM:
    3127           0 :                         if (opt.opt_val != 0)
    3128             :                                 return -EINVAL;
    3129             : 
    3130           0 :                         tp->rx_opt.sack_ok |= TCP_SACK_SEEN;
    3131           0 :                         break;
    3132           0 :                 case TCPOPT_TIMESTAMP:
    3133           0 :                         if (opt.opt_val != 0)
    3134             :                                 return -EINVAL;
    3135             : 
    3136           0 :                         tp->rx_opt.tstamp_ok = 1;
    3137           0 :                         break;
    3138             :                 }
    3139             :         }
    3140             : 
    3141             :         return 0;
    3142             : }
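
/* Usage sketch (not part of tcp.c): replaying negotiated options into a
 * repaired, TCP_ESTABLISHED socket via the loop above. The TCPOPT_* kind
 * values are the standard ones from the TCP specs but are not exported
 * through UAPI headers, so they are defined locally; for TCPOPT_WINDOW the
 * send scale sits in the low 16 bits and the receive scale in the high 16,
 * matching the parsing above. Error handling is minimal.
 */
#include <netinet/in.h>
#include <sys/socket.h>
#include <linux/tcp.h>

#define TCPOPT_WINDOW           3       /* window scale   */
#define TCPOPT_SACK_PERM        4       /* SACK permitted */

static int restore_options(int fd, __u16 snd_wscale, __u16 rcv_wscale)
{
        struct tcp_repair_opt opts[] = {
                { TCPOPT_SACK_PERM, 0 },
                { TCPOPT_WINDOW, snd_wscale | ((__u32)rcv_wscale << 16) },
        };

        return setsockopt(fd, IPPROTO_TCP, TCP_REPAIR_OPTIONS,
                          opts, sizeof(opts));
}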
    3143             : 
    3144             : DEFINE_STATIC_KEY_FALSE(tcp_tx_delay_enabled);
    3145             : EXPORT_SYMBOL(tcp_tx_delay_enabled);
    3146             : 
    3147           0 : static void tcp_enable_tx_delay(void)
    3148             : {
    3149           0 :         if (!static_branch_unlikely(&tcp_tx_delay_enabled)) {
    3150           0 :                 static int __tcp_tx_delay_enabled = 0;
    3151             : 
    3152           0 :                 if (cmpxchg(&__tcp_tx_delay_enabled, 0, 1) == 0) {
    3153           0 :                         static_branch_enable(&tcp_tx_delay_enabled);
    3154           0 :                         pr_info("TCP_TX_DELAY enabled\n");
    3155             :                 }
    3156             :         }
    3157           0 : }
    3158             : 
    3159             : /* When set, always queue non-full frames.  When the user later clears
    3160             :  * this option, we transmit any pending partial frames in the queue.  This is
    3161             :  * meant to be used alongside sendfile() to get properly filled frames when the
    3162             :  * user (for example) must write out headers with a write() call first and then
    3163             :  * use sendfile() to send out the data parts.
    3164             :  *
    3165             :  * TCP_CORK can be set together with TCP_NODELAY and it is stronger than
    3166             :  * TCP_NODELAY.
    3167             :  */
    3168           0 : static void __tcp_sock_set_cork(struct sock *sk, bool on)
    3169             : {
    3170           0 :         struct tcp_sock *tp = tcp_sk(sk);
    3171             : 
    3172           0 :         if (on) {
    3173           0 :                 tp->nonagle |= TCP_NAGLE_CORK;
    3174             :         } else {
    3175           0 :                 tp->nonagle &= ~TCP_NAGLE_CORK;
    3176           0 :                 if (tp->nonagle & TCP_NAGLE_OFF)
    3177           0 :                         tp->nonagle |= TCP_NAGLE_PUSH;
    3178           0 :                 tcp_push_pending_frames(sk);
    3179             :         }
    3180           0 : }
    3181             : 
    3182           0 : void tcp_sock_set_cork(struct sock *sk, bool on)
    3183             : {
    3184           0 :         lock_sock(sk);
    3185           0 :         __tcp_sock_set_cork(sk, on);
    3186           0 :         release_sock(sk);
    3187           0 : }
    3188             : EXPORT_SYMBOL(tcp_sock_set_cork);
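
/* Usage sketch (not part of tcp.c): the header-plus-sendfile() pattern the
 * comment above describes. While corked, the short header is queued rather
 * than sent as a tiny segment; clearing the cork pushes the final partial
 * frame. "fd" is a connected TCP socket and "filefd"/"filesize" describe
 * the payload; error handling is omitted for brevity.
 */
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/sendfile.h>
#include <sys/socket.h>
#include <unistd.h>

static int send_with_header(int fd, const char *hdr, size_t hdrlen,
                            int filefd, size_t filesize)
{
        int on = 1, off = 0;

        setsockopt(fd, IPPROTO_TCP, TCP_CORK, &on, sizeof(on));
        write(fd, hdr, hdrlen);                 /* queued, not yet pushed  */
        sendfile(fd, filefd, NULL, filesize);   /* fills full-sized frames */
        /* uncork: transmit any remaining partial frame */
        return setsockopt(fd, IPPROTO_TCP, TCP_CORK, &off, sizeof(off));
}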
    3189             : 
    3190             : /* TCP_NODELAY is weaker than TCP_CORK, so setting this option on a corked
    3191             :  * socket is remembered but not activated until the cork is cleared.
    3192             :  *
    3193             :  * However, when TCP_NODELAY is set we make an explicit push, which overrides
    3194             :  * even TCP_CORK for currently queued segments.
    3195             :  */
    3196           5 : static void __tcp_sock_set_nodelay(struct sock *sk, bool on)
    3197             : {
    3198           5 :         if (on) {
    3199           5 :                 tcp_sk(sk)->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
    3200           5 :                 tcp_push_pending_frames(sk);
    3201             :         } else {
    3202           0 :                 tcp_sk(sk)->nonagle &= ~TCP_NAGLE_OFF;
    3203             :         }
    3204           5 : }
    3205             : 
    3206           0 : void tcp_sock_set_nodelay(struct sock *sk)
    3207             : {
    3208           0 :         lock_sock(sk);
    3209           0 :         __tcp_sock_set_nodelay(sk, true);
    3210           0 :         release_sock(sk);
    3211           0 : }
    3212             : EXPORT_SYMBOL(tcp_sock_set_nodelay);
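
/* Usage sketch (not part of tcp.c): the common user-space way to reach
 * __tcp_sock_set_nodelay(sk, true) above, for latency-sensitive
 * request/response traffic where Nagle's coalescing is unwanted. Error
 * handling is minimal.
 */
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

static int disable_nagle(int fd)
{
        int one = 1;

        /* pushes queued partial frames now and sends future small
         * writes immediately instead of waiting for ACKs
         */
        return setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &one, sizeof(one));
}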
    3213             : 
    3214           0 : static void __tcp_sock_set_quickack(struct sock *sk, int val)
    3215             : {
    3216           0 :         if (!val) {
    3217           0 :                 inet_csk_enter_pingpong_mode(sk);
    3218           0 :                 return;
    3219             :         }
    3220             : 
    3221           0 :         inet_csk_exit_pingpong_mode(sk);
    3222           0 :         if ((1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
    3223           0 :             inet_csk_ack_scheduled(sk)) {
    3224           0 :                 inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_PUSHED;
    3225           0 :                 tcp_cleanup_rbuf(sk, 1);
    3226           0 :                 if (!(val & 1))
    3227           0 :                         inet_csk_enter_pingpong_mode(sk);
    3228             :         }
    3229             : }
    3230             : 
    3231           0 : void tcp_sock_set_quickack(struct sock *sk, int val)
    3232             : {
    3233           0 :         lock_sock(sk);
    3234           0 :         __tcp_sock_set_quickack(sk, val);
    3235           0 :         release_sock(sk);
    3236           0 : }
    3237             : EXPORT_SYMBOL(tcp_sock_set_quickack);
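
/* Usage sketch (not part of tcp.c): toggling delayed ACKs around a data
 * burst. Per the val handling above, a nonzero value leaves pingpong
 * (delayed-ACK) mode and may flush a pending ACK; zero re-enters it. The
 * effect is not permanent - the kernel can re-enter pingpong mode later.
 * Error handling is minimal.
 */
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

static int set_quickack(int fd, int enable)
{
        return setsockopt(fd, IPPROTO_TCP, TCP_QUICKACK,
                          &enable, sizeof(enable));
}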
    3238             : 
    3239           0 : int tcp_sock_set_syncnt(struct sock *sk, int val)
    3240             : {
    3241           0 :         if (val < 1 || val > MAX_TCP_SYNCNT)
    3242             :                 return -EINVAL;
    3243             : 
    3244           0 :         lock_sock(sk);
    3245           0 :         inet_csk(sk)->icsk_syn_retries = val;
    3246           0 :         release_sock(sk);
    3247           0 :         return 0;
    3248             : }
    3249             : EXPORT_SYMBOL(tcp_sock_set_syncnt);
    3250             : 
    3251           0 : void tcp_sock_set_user_timeout(struct sock *sk, u32 val)
    3252             : {
    3253           0 :         lock_sock(sk);
    3254           0 :         inet_csk(sk)->icsk_user_timeout = val;
    3255           0 :         release_sock(sk);
    3256           0 : }
    3257             : EXPORT_SYMBOL(tcp_sock_set_user_timeout);
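
/* Usage sketch (not part of tcp.c): the user-space equivalent of
 * tcp_sock_set_user_timeout() above. The value is the maximum time in
 * milliseconds that transmitted data may remain unacknowledged before the
 * connection is aborted with ETIMEDOUT; 0 restores the system default.
 * Error handling is minimal.
 */
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

static int set_user_timeout_ms(int fd, unsigned int ms)
{
        return setsockopt(fd, IPPROTO_TCP, TCP_USER_TIMEOUT,
                          &ms, sizeof(ms));
}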
    3258             : 
    3259           0 : int tcp_sock_set_keepidle_locked(struct sock *sk, int val)
    3260             : {
    3261           0 :         struct tcp_sock *tp = tcp_sk(sk);
    3262             : 
    3263           0 :         if (val < 1 || val > MAX_TCP_KEEPIDLE)
    3264             :                 return -EINVAL;
    3265             : 
    3266           0 :         tp->keepalive_time = val * HZ;
    3267           0 :         if (sock_flag(sk, SOCK_KEEPOPEN) &&
    3268           0 :             !((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) {
    3269           0 :                 u32 elapsed = keepalive_time_elapsed(tp);
    3270             : 
    3271           0 :                 if (tp->keepalive_time > elapsed)
    3272           0 :                         elapsed = tp->keepalive_time - elapsed;
    3273             :                 else
    3274             :                         elapsed = 0;
    3275           0 :                 inet_csk_reset_keepalive_timer(sk, elapsed);
    3276             :         }
    3277             : 
    3278             :         return 0;
    3279             : }
    3280             : 
    3281           0 : int tcp_sock_set_keepidle(struct sock *sk, int val)
    3282             : {
    3283           0 :         int err;
    3284             : 
    3285           0 :         lock_sock(sk);
    3286           0 :         err = tcp_sock_set_keepidle_locked(sk, val);
    3287           0 :         release_sock(sk);
    3288           0 :         return err;
    3289             : }
    3290             : EXPORT_SYMBOL(tcp_sock_set_keepidle);
    3291             : 
    3292           0 : int tcp_sock_set_keepintvl(struct sock *sk, int val)
    3293             : {
    3294           0 :         if (val < 1 || val > MAX_TCP_KEEPINTVL)
    3295             :                 return -EINVAL;
    3296             : 
    3297           0 :         lock_sock(sk);
    3298           0 :         tcp_sk(sk)->keepalive_intvl = val * HZ;
    3299           0 :         release_sock(sk);
    3300           0 :         return 0;
    3301             : }
    3302             : EXPORT_SYMBOL(tcp_sock_set_keepintvl);
    3303             : 
    3304           0 : int tcp_sock_set_keepcnt(struct sock *sk, int val)
    3305             : {
    3306           0 :         if (val < 1 || val > MAX_TCP_KEEPCNT)
    3307             :                 return -EINVAL;
    3308             : 
    3309           0 :         lock_sock(sk);
    3310           0 :         tcp_sk(sk)->keepalive_probes = val;
    3311           0 :         release_sock(sk);
    3312           0 :         return 0;
    3313             : }
    3314             : EXPORT_SYMBOL(tcp_sock_set_keepcnt);
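
/* Usage sketch (not part of tcp.c): a full keepalive setup combining the
 * three per-socket knobs above with SO_KEEPALIVE. With these example
 * values, a dead peer on an idle connection is detected after roughly
 * 60 + 3 * 10 seconds. Error handling is minimal.
 */
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

static int enable_keepalive(int fd)
{
        int on = 1;
        int idle = 60;          /* seconds idle before the first probe */
        int intvl = 10;         /* seconds between probes              */
        int cnt = 3;            /* unanswered probes before aborting   */

        if (setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &on, sizeof(on)) < 0)
                return -1;
        if (setsockopt(fd, IPPROTO_TCP, TCP_KEEPIDLE, &idle, sizeof(idle)) < 0)
                return -1;
        if (setsockopt(fd, IPPROTO_TCP, TCP_KEEPINTVL, &intvl, sizeof(intvl)) < 0)
                return -1;
        return setsockopt(fd, IPPROTO_TCP, TCP_KEEPCNT, &cnt, sizeof(cnt));
}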
    3315             : 
    3316           0 : int tcp_set_window_clamp(struct sock *sk, int val)
    3317             : {
    3318           0 :         struct tcp_sock *tp = tcp_sk(sk);
    3319             : 
    3320           0 :         if (!val) {
    3321           0 :                 if (sk->sk_state != TCP_CLOSE)
    3322             :                         return -EINVAL;
    3323           0 :                 tp->window_clamp = 0;
    3324             :         } else {
    3325           0 :                 tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
    3326             :                         SOCK_MIN_RCVBUF / 2 : val;
    3327             :         }
    3328             :         return 0;
    3329             : }
    3330             : 
    3331             : /*
    3332             :  *      Socket option code for TCP.
    3333             :  */
    3334           6 : static int do_tcp_setsockopt(struct sock *sk, int level, int optname,
    3335             :                 sockptr_t optval, unsigned int optlen)
    3336             : {
    3337           6 :         struct tcp_sock *tp = tcp_sk(sk);
    3338           6 :         struct inet_connection_sock *icsk = inet_csk(sk);
    3339           6 :         struct net *net = sock_net(sk);
    3340           6 :         int val;
    3341           6 :         int err = 0;
    3342             : 
    3343             :         /* These are data/string values, all the others are ints */
    3344           6 :         switch (optname) {
    3345           0 :         case TCP_CONGESTION: {
    3346           0 :                 char name[TCP_CA_NAME_MAX];
    3347             : 
    3348           0 :                 if (optlen < 1)
    3349             :                         return -EINVAL;
    3350             : 
    3351           0 :                 val = strncpy_from_sockptr(name, optval,
    3352           0 :                                         min_t(long, TCP_CA_NAME_MAX-1, optlen));
    3353           0 :                 if (val < 0)
    3354             :                         return -EFAULT;
    3355           0 :                 name[val] = 0;
    3356             : 
    3357           0 :                 lock_sock(sk);
    3358           0 :                 err = tcp_set_congestion_control(sk, name, true,
    3359           0 :                                                  ns_capable(sock_net(sk)->user_ns,
    3360             :                                                             CAP_NET_ADMIN));
    3361           0 :                 release_sock(sk);
    3362           0 :                 return err;
    3363             :         }
    3364           0 :         case TCP_ULP: {
    3365           0 :                 char name[TCP_ULP_NAME_MAX];
    3366             : 
    3367           0 :                 if (optlen < 1)
    3368             :                         return -EINVAL;
    3369             : 
    3370           0 :                 val = strncpy_from_sockptr(name, optval,
    3371           0 :                                         min_t(long, TCP_ULP_NAME_MAX - 1,
    3372             :                                               optlen));
    3373           0 :                 if (val < 0)
    3374             :                         return -EFAULT;
    3375           0 :                 name[val] = 0;
    3376             : 
    3377           0 :                 lock_sock(sk);
    3378           0 :                 err = tcp_set_ulp(sk, name);
    3379           0 :                 release_sock(sk);
    3380           0 :                 return err;
    3381             :         }
    3382           0 :         case TCP_FASTOPEN_KEY: {
    3383           0 :                 __u8 key[TCP_FASTOPEN_KEY_BUF_LENGTH];
    3384           0 :                 __u8 *backup_key = NULL;
    3385             : 
    3386             :                 /* Allow a backup key as well to facilitate key rotation.
    3387             :                  * The first key is the active one.
    3388             :                  */
    3389           0 :                 if (optlen != TCP_FASTOPEN_KEY_LENGTH &&
    3390           0 :                     optlen != TCP_FASTOPEN_KEY_BUF_LENGTH)
    3391             :                         return -EINVAL;
    3392             : 
    3393           0 :                 if (copy_from_sockptr(key, optval, optlen))
    3394             :                         return -EFAULT;
    3395             : 
    3396           0 :                 if (optlen == TCP_FASTOPEN_KEY_BUF_LENGTH)
    3397           0 :                         backup_key = key + TCP_FASTOPEN_KEY_LENGTH;
    3398             : 
    3399           0 :                 return tcp_fastopen_reset_cipher(net, sk, key, backup_key);
    3400             :         }
    3401             :         default:
    3402             :                 /* fallthru */
    3403           6 :                 break;
    3404             :         }
    3405             : 
    3406           6 :         if (optlen < sizeof(int))
    3407             :                 return -EINVAL;
    3408             : 
    3409           6 :         if (copy_from_sockptr(&val, optval, sizeof(val)))
    3410             :                 return -EFAULT;
    3411             : 
    3412           6 :         lock_sock(sk);
    3413             : 
    3414           6 :         switch (optname) {
    3415           0 :         case TCP_MAXSEG:
    3416             :                 /* Values greater than the interface MTU won't take effect.
    3417             :                  * However, at the point when this call is made we typically
    3418             :                  * don't yet know which interface is going to be used.
    3419             :                  */
    3420           0 :                 if (val && (val < TCP_MIN_MSS || val > MAX_TCP_WINDOW)) {
    3421             :                         err = -EINVAL;
    3422             :                         break;
    3423             :                 }
    3424           0 :                 tp->rx_opt.user_mss = val;
    3425           0 :                 break;
    3426             : 
    3427           5 :         case TCP_NODELAY:
    3428           5 :                 __tcp_sock_set_nodelay(sk, val);
    3429           5 :                 break;
    3430             : 
    3431           0 :         case TCP_THIN_LINEAR_TIMEOUTS:
    3432           0 :                 if (val < 0 || val > 1)
    3433             :                         err = -EINVAL;
    3434             :                 else
    3435           0 :                         tp->thin_lto = val;
    3436             :                 break;
    3437             : 
    3438           0 :         case TCP_THIN_DUPACK:
    3439           0 :                 if (val < 0 || val > 1)
    3440           0 :                         err = -EINVAL;
    3441             :                 break;
    3442             : 
    3443           0 :         case TCP_REPAIR:
    3444           0 :                 if (!tcp_can_repair_sock(sk))
    3445             :                         err = -EPERM;
    3446           0 :                 else if (val == TCP_REPAIR_ON) {
    3447           0 :                         tp->repair = 1;
    3448           0 :                         sk->sk_reuse = SK_FORCE_REUSE;
    3449           0 :                         tp->repair_queue = TCP_NO_QUEUE;
    3450           0 :                 } else if (val == TCP_REPAIR_OFF) {
    3451           0 :                         tp->repair = 0;
    3452           0 :                         sk->sk_reuse = SK_NO_REUSE;
    3453           0 :                         tcp_send_window_probe(sk);
    3454           0 :                 } else if (val == TCP_REPAIR_OFF_NO_WP) {
    3455           0 :                         tp->repair = 0;
    3456           0 :                         sk->sk_reuse = SK_NO_REUSE;
    3457             :                 } else
    3458             :                         err = -EINVAL;
    3459             : 
    3460             :                 break;
    3461             : 
    3462           0 :         case TCP_REPAIR_QUEUE:
    3463           0 :                 if (!tp->repair)
    3464             :                         err = -EPERM;
    3465           0 :                 else if ((unsigned int)val < TCP_QUEUES_NR)
    3466           0 :                         tp->repair_queue = val;
    3467             :                 else
    3468             :                         err = -EINVAL;
    3469             :                 break;
    3470             : 
    3471           0 :         case TCP_QUEUE_SEQ:
    3472           0 :                 if (sk->sk_state != TCP_CLOSE) {
    3473             :                         err = -EPERM;
    3474           0 :                 } else if (tp->repair_queue == TCP_SEND_QUEUE) {
    3475           0 :                         if (!tcp_rtx_queue_empty(sk))
    3476             :                                 err = -EPERM;
    3477             :                         else
    3478           0 :                                 WRITE_ONCE(tp->write_seq, val);
    3479           0 :                 } else if (tp->repair_queue == TCP_RECV_QUEUE) {
    3480           0 :                         if (tp->rcv_nxt != tp->copied_seq) {
    3481             :                                 err = -EPERM;
    3482             :                         } else {
    3483           0 :                                 WRITE_ONCE(tp->rcv_nxt, val);
    3484           0 :                                 WRITE_ONCE(tp->copied_seq, val);
    3485             :                         }
    3486             :                 } else {
    3487             :                         err = -EINVAL;
    3488             :                 }
    3489             :                 break;
    3490             : 
    3491           0 :         case TCP_REPAIR_OPTIONS:
    3492           0 :                 if (!tp->repair)
    3493             :                         err = -EINVAL;
    3494           0 :                 else if (sk->sk_state == TCP_ESTABLISHED)
    3495           0 :                         err = tcp_repair_options_est(sk, optval, optlen);
    3496             :                 else
    3497             :                         err = -EPERM;
    3498             :                 break;
    3499             : 
    3500           0 :         case TCP_CORK:
    3501           0 :                 __tcp_sock_set_cork(sk, val);
    3502           0 :                 break;
    3503             : 
    3504           0 :         case TCP_KEEPIDLE:
    3505           0 :                 err = tcp_sock_set_keepidle_locked(sk, val);
    3506           0 :                 break;
    3507           0 :         case TCP_KEEPINTVL:
    3508           0 :                 if (val < 1 || val > MAX_TCP_KEEPINTVL)
    3509             :                         err = -EINVAL;
    3510             :                 else
    3511           0 :                         tp->keepalive_intvl = val * HZ;
    3512             :                 break;
    3513           0 :         case TCP_KEEPCNT:
    3514           0 :                 if (val < 1 || val > MAX_TCP_KEEPCNT)
    3515             :                         err = -EINVAL;
    3516             :                 else
    3517           0 :                         tp->keepalive_probes = val;
    3518             :                 break;
    3519           0 :         case TCP_SYNCNT:
    3520           0 :                 if (val < 1 || val > MAX_TCP_SYNCNT)
    3521             :                         err = -EINVAL;
    3522             :                 else
    3523           0 :                         icsk->icsk_syn_retries = val;
    3524             :                 break;
    3525             : 
    3526           0 :         case TCP_SAVE_SYN:
    3527             :                 /* 0: disable, 1: enable, 2: start from ether_header */
    3528           0 :                 if (val < 0 || val > 2)
    3529             :                         err = -EINVAL;
    3530             :                 else
    3531           0 :                         tp->save_syn = val;
    3532             :                 break;
    3533             : 
    3534           0 :         case TCP_LINGER2:
    3535           0 :                 if (val < 0)
    3536           0 :                         tp->linger2 = -1;
    3537           0 :                 else if (val > TCP_FIN_TIMEOUT_MAX / HZ)
    3538           0 :                         tp->linger2 = TCP_FIN_TIMEOUT_MAX;
    3539             :                 else
    3540           0 :                         tp->linger2 = val * HZ;
    3541             :                 break;
    3542             : 
    3543           0 :         case TCP_DEFER_ACCEPT:
    3544             :                 /* Translate value in seconds to number of retransmits */
    3545           0 :                 icsk->icsk_accept_queue.rskq_defer_accept =
    3546           0 :                         secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ,
    3547             :                                         TCP_RTO_MAX / HZ);
    3548           0 :                 break;
    3549             : 
    3550           0 :         case TCP_WINDOW_CLAMP:
    3551           0 :                 err = tcp_set_window_clamp(sk, val);
    3552             :                 break;
    3553             : 
    3554           0 :         case TCP_QUICKACK:
    3555           0 :                 __tcp_sock_set_quickack(sk, val);
    3556           0 :                 break;
    3557             : 
    3558             : #ifdef CONFIG_TCP_MD5SIG
    3559             :         case TCP_MD5SIG:
    3560             :         case TCP_MD5SIG_EXT:
    3561             :                 err = tp->af_specific->md5_parse(sk, optname, optval, optlen);
    3562             :                 break;
    3563             : #endif
    3564           0 :         case TCP_USER_TIMEOUT:
    3565             :                 /* Cap the max time in ms TCP will retry or probe the window
    3566             :                  * before giving up and aborting (ETIMEDOUT) a connection.
    3567             :                  */
    3568           0 :                 if (val < 0)
    3569             :                         err = -EINVAL;
    3570             :                 else
    3571           0 :                         icsk->icsk_user_timeout = val;
    3572             :                 break;
    3573             : 
    3574           1 :         case TCP_FASTOPEN:
    3575           1 :                 if (val >= 0 && ((1 << sk->sk_state) & (TCPF_CLOSE |
    3576             :                     TCPF_LISTEN))) {
    3577           1 :                         tcp_fastopen_init_key_once(net);
    3578             : 
    3579           1 :                         fastopen_queue_tune(sk, val);
    3580             :                 } else {
    3581             :                         err = -EINVAL;
    3582             :                 }
    3583             :                 break;
    3584           0 :         case TCP_FASTOPEN_CONNECT:
    3585           0 :                 if (val > 1 || val < 0) {
    3586             :                         err = -EINVAL;
    3587           0 :                 } else if (net->ipv4.sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) {
    3588           0 :                         if (sk->sk_state == TCP_CLOSE)
    3589           0 :                                 tp->fastopen_connect = val;
    3590             :                         else
    3591             :                                 err = -EINVAL;
    3592             :                 } else {
    3593             :                         err = -EOPNOTSUPP;
    3594             :                 }
    3595             :                 break;
    3596           0 :         case TCP_FASTOPEN_NO_COOKIE:
    3597           0 :                 if (val > 1 || val < 0)
    3598             :                         err = -EINVAL;
    3599           0 :                 else if (!((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
    3600             :                         err = -EINVAL;
    3601             :                 else
    3602           0 :                         tp->fastopen_no_cookie = val;
    3603             :                 break;
    3604           0 :         case TCP_TIMESTAMP:
    3605           0 :                 if (!tp->repair)
    3606             :                         err = -EPERM;
    3607             :                 else
    3608           0 :                         tp->tsoffset = val - tcp_time_stamp_raw();
    3609             :                 break;
    3610           0 :         case TCP_REPAIR_WINDOW:
    3611           0 :                 err = tcp_repair_set_window(tp, optval, optlen);
    3612           0 :                 break;
    3613           0 :         case TCP_NOTSENT_LOWAT:
    3614           0 :                 tp->notsent_lowat = val;
    3615           0 :                 sk->sk_write_space(sk);
    3616           0 :                 break;
    3617           0 :         case TCP_INQ:
    3618           0 :                 if (val > 1 || val < 0)
    3619             :                         err = -EINVAL;
    3620             :                 else
    3621           0 :                         tp->recvmsg_inq = val;
    3622             :                 break;
    3623           0 :         case TCP_TX_DELAY:
    3624           0 :                 if (val)
    3625           0 :                         tcp_enable_tx_delay();
    3626           0 :                 tp->tcp_tx_delay = val;
    3627           0 :                 break;
    3628             :         default:
    3629             :                 err = -ENOPROTOOPT;
    3630             :                 break;
    3631             :         }
    3632             : 
    3633           6 :         release_sock(sk);
    3634           6 :         return err;
    3635             : }
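
/* Usage sketch (not part of tcp.c): server-side TCP Fast Open, taking the
 * TCP_FASTOPEN branch of do_tcp_setsockopt() above while the socket is
 * still in TCPF_CLOSE or TCPF_LISTEN. The value bounds the queue of
 * pending SYN+data requests (fastopen_queue_tune()). Error handling is
 * minimal.
 */
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

static int listen_with_tfo(int fd, int backlog)
{
        int qlen = 16;  /* max outstanding Fast Open requests */

        if (setsockopt(fd, IPPROTO_TCP, TCP_FASTOPEN, &qlen, sizeof(qlen)) < 0)
                return -1;
        return listen(fd, backlog);
}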
    3636             : 
    3637          10 : int tcp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
    3638             :                    unsigned int optlen)
    3639             : {
    3640          10 :         const struct inet_connection_sock *icsk = inet_csk(sk);
    3641             : 
    3642          10 :         if (level != SOL_TCP)
    3643           4 :                 return icsk->icsk_af_ops->setsockopt(sk, level, optname,
    3644             :                                                      optval, optlen);
    3645           6 :         return do_tcp_setsockopt(sk, level, optname, optval, optlen);
    3646             : }
    3647             : EXPORT_SYMBOL(tcp_setsockopt);
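
/* Usage sketch (not part of tcp.c): the string-valued TCP_CONGESTION case
 * handled at the top of do_tcp_setsockopt() above. Selecting a non-default
 * algorithm may require CAP_NET_ADMIN or that the module be listed in
 * net.ipv4.tcp_allowed_congestion_control. Error handling is minimal.
 */
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <string.h>
#include <sys/socket.h>

static int set_congestion_control(int fd, const char *name)
{
        /* e.g. "cubic", "reno", or "bbr" if the module is available */
        return setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION,
                          name, strlen(name));
}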
    3648             : 
    3649           0 : static void tcp_get_info_chrono_stats(const struct tcp_sock *tp,
    3650             :                                       struct tcp_info *info)
    3651             : {
    3652           0 :         u64 stats[__TCP_CHRONO_MAX], total = 0;
    3653           0 :         enum tcp_chrono i;
    3654             : 
    3655           0 :         for (i = TCP_CHRONO_BUSY; i < __TCP_CHRONO_MAX; ++i) {
    3656           0 :                 stats[i] = tp->chrono_stat[i - 1];
    3657           0 :                 if (i == tp->chrono_type)
    3658           0 :                         stats[i] += tcp_jiffies32 - tp->chrono_start;
    3659           0 :                 stats[i] *= USEC_PER_SEC / HZ;
    3660           0 :                 total += stats[i];
    3661             :         }
    3662             : 
    3663           0 :         info->tcpi_busy_time = total;
    3664           0 :         info->tcpi_rwnd_limited = stats[TCP_CHRONO_RWND_LIMITED];
    3665           0 :         info->tcpi_sndbuf_limited = stats[TCP_CHRONO_SNDBUF_LIMITED];
    3666           0 : }
    3667             : 
    3668             : /* Return information about the state of a TCP endpoint, in API format. */
    3669           0 : void tcp_get_info(struct sock *sk, struct tcp_info *info)
    3670             : {
    3671           0 :         const struct tcp_sock *tp = tcp_sk(sk); /* iff sk_type == SOCK_STREAM */
    3672           0 :         const struct inet_connection_sock *icsk = inet_csk(sk);
    3673           0 :         unsigned long rate;
    3674           0 :         u32 now;
    3675           0 :         u64 rate64;
    3676           0 :         bool slow;
    3677             : 
    3678           0 :         memset(info, 0, sizeof(*info));
    3679           0 :         if (sk->sk_type != SOCK_STREAM)
    3680             :                 return;
    3681             : 
    3682           0 :         info->tcpi_state = inet_sk_state_load(sk);
    3683             : 
    3684             :         /* Report meaningful fields for all TCP states, including listeners */
    3685           0 :         rate = READ_ONCE(sk->sk_pacing_rate);
    3686           0 :         rate64 = (rate != ~0UL) ? rate : ~0ULL;
    3687           0 :         info->tcpi_pacing_rate = rate64;
    3688             : 
    3689           0 :         rate = READ_ONCE(sk->sk_max_pacing_rate);
    3690           0 :         rate64 = (rate != ~0UL) ? rate : ~0ULL;
    3691           0 :         info->tcpi_max_pacing_rate = rate64;
    3692             : 
    3693           0 :         info->tcpi_reordering = tp->reordering;
    3694           0 :         info->tcpi_snd_cwnd = tp->snd_cwnd;
    3695             : 
    3696           0 :         if (info->tcpi_state == TCP_LISTEN) {
    3697             :                 /* listener's aliased fields:
    3698             :                  * tcpi_unacked -> number of children ready for accept()
    3699             :                  * tcpi_sacked  -> max backlog
    3700             :                  */
    3701           0 :                 info->tcpi_unacked = READ_ONCE(sk->sk_ack_backlog);
    3702           0 :                 info->tcpi_sacked = READ_ONCE(sk->sk_max_ack_backlog);
    3703           0 :                 return;
    3704             :         }
    3705             : 
    3706           0 :         slow = lock_sock_fast(sk);
    3707             : 
    3708           0 :         info->tcpi_ca_state = icsk->icsk_ca_state;
    3709           0 :         info->tcpi_retransmits = icsk->icsk_retransmits;
    3710           0 :         info->tcpi_probes = icsk->icsk_probes_out;
    3711           0 :         info->tcpi_backoff = icsk->icsk_backoff;
    3712             : 
    3713           0 :         if (tp->rx_opt.tstamp_ok)
    3714           0 :                 info->tcpi_options |= TCPI_OPT_TIMESTAMPS;
    3715           0 :         if (tcp_is_sack(tp))
    3716           0 :                 info->tcpi_options |= TCPI_OPT_SACK;
    3717           0 :         if (tp->rx_opt.wscale_ok) {
    3718           0 :                 info->tcpi_options |= TCPI_OPT_WSCALE;
    3719           0 :                 info->tcpi_snd_wscale = tp->rx_opt.snd_wscale;
    3720           0 :                 info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale;
    3721             :         }
    3722             : 
    3723           0 :         if (tp->ecn_flags & TCP_ECN_OK)
    3724           0 :                 info->tcpi_options |= TCPI_OPT_ECN;
    3725           0 :         if (tp->ecn_flags & TCP_ECN_SEEN)
    3726           0 :                 info->tcpi_options |= TCPI_OPT_ECN_SEEN;
    3727           0 :         if (tp->syn_data_acked)
    3728           0 :                 info->tcpi_options |= TCPI_OPT_SYN_DATA;
    3729             : 
    3730           0 :         info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto);
    3731           0 :         info->tcpi_ato = jiffies_to_usecs(icsk->icsk_ack.ato);
    3732           0 :         info->tcpi_snd_mss = tp->mss_cache;
    3733           0 :         info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss;
    3734             : 
    3735           0 :         info->tcpi_unacked = tp->packets_out;
    3736           0 :         info->tcpi_sacked = tp->sacked_out;
    3737             : 
    3738           0 :         info->tcpi_lost = tp->lost_out;
    3739           0 :         info->tcpi_retrans = tp->retrans_out;
    3740             : 
    3741           0 :         now = tcp_jiffies32;
    3742           0 :         info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime);
    3743           0 :         info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime);
    3744           0 :         info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp);
    3745             : 
    3746           0 :         info->tcpi_pmtu = icsk->icsk_pmtu_cookie;
    3747           0 :         info->tcpi_rcv_ssthresh = tp->rcv_ssthresh;
    3748           0 :         info->tcpi_rtt = tp->srtt_us >> 3;
    3749           0 :         info->tcpi_rttvar = tp->mdev_us >> 2;
    3750           0 :         info->tcpi_snd_ssthresh = tp->snd_ssthresh;
    3751           0 :         info->tcpi_advmss = tp->advmss;
    3752             : 
    3753           0 :         info->tcpi_rcv_rtt = tp->rcv_rtt_est.rtt_us >> 3;
    3754           0 :         info->tcpi_rcv_space = tp->rcvq_space.space;
    3755             : 
    3756           0 :         info->tcpi_total_retrans = tp->total_retrans;
    3757             : 
    3758           0 :         info->tcpi_bytes_acked = tp->bytes_acked;
    3759           0 :         info->tcpi_bytes_received = tp->bytes_received;
    3760           0 :         info->tcpi_notsent_bytes = max_t(int, 0, tp->write_seq - tp->snd_nxt);
    3761           0 :         tcp_get_info_chrono_stats(tp, info);
    3762             : 
    3763           0 :         info->tcpi_segs_out = tp->segs_out;
    3764           0 :         info->tcpi_segs_in = tp->segs_in;
    3765             : 
    3766           0 :         info->tcpi_min_rtt = tcp_min_rtt(tp);
    3767           0 :         info->tcpi_data_segs_in = tp->data_segs_in;
    3768           0 :         info->tcpi_data_segs_out = tp->data_segs_out;
    3769             : 
    3770           0 :         info->tcpi_delivery_rate_app_limited = tp->rate_app_limited ? 1 : 0;
    3771           0 :         rate64 = tcp_compute_delivery_rate(tp);
    3772           0 :         if (rate64)
    3773           0 :                 info->tcpi_delivery_rate = rate64;
    3774           0 :         info->tcpi_delivered = tp->delivered;
    3775           0 :         info->tcpi_delivered_ce = tp->delivered_ce;
    3776           0 :         info->tcpi_bytes_sent = tp->bytes_sent;
    3777           0 :         info->tcpi_bytes_retrans = tp->bytes_retrans;
    3778           0 :         info->tcpi_dsack_dups = tp->dsack_dups;
    3779           0 :         info->tcpi_reord_seen = tp->reord_seen;
    3780           0 :         info->tcpi_rcv_ooopack = tp->rcv_ooopack;
    3781           0 :         info->tcpi_snd_wnd = tp->snd_wnd;
    3782           0 :         info->tcpi_fastopen_client_fail = tp->fastopen_client_fail;
    3783           0 :         unlock_sock_fast(sk, slow);
    3784             : }
    3785             : EXPORT_SYMBOL_GPL(tcp_get_info);
    3786             : 
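/* A minimal userspace sketch (not part of this file; error handling
 * elided) of the consumer side of tcp_get_info() above: getsockopt(TCP_INFO)
 * copies out the struct tcp_info filled in by this function (see the
 * TCP_INFO case in do_tcp_getsockopt() further down). glibc's
 * <netinet/tcp.h> carries a copy of the struct, so no kernel headers are
 * needed.
 */
#include <stdio.h>
#include <string.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

static void dump_tcp_info(int fd)
{
        struct tcp_info info;
        socklen_t len = sizeof(info);   /* kernel truncates to min(len, sizeof(info)) */

        memset(&info, 0, sizeof(info));
        if (getsockopt(fd, IPPROTO_TCP, TCP_INFO, &info, &len) == 0)
                printf("rtt=%uus rttvar=%uus retrans=%u pmtu=%u\n",
                       info.tcpi_rtt, info.tcpi_rttvar,
                       info.tcpi_total_retrans, info.tcpi_pmtu);
}
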
    3787           0 : static size_t tcp_opt_stats_get_size(void)
    3788             : {
    3789           0 :         return
    3790           0 :                 nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BUSY */
    3791           0 :                 nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_RWND_LIMITED */
    3792           0 :                 nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_SNDBUF_LIMITED */
    3793           0 :                 nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_DATA_SEGS_OUT */
    3794           0 :                 nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_TOTAL_RETRANS */
    3795           0 :                 nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_PACING_RATE */
    3796           0 :                 nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_DELIVERY_RATE */
    3797           0 :                 nla_total_size(sizeof(u32)) + /* TCP_NLA_SND_CWND */
    3798           0 :                 nla_total_size(sizeof(u32)) + /* TCP_NLA_REORDERING */
    3799           0 :                 nla_total_size(sizeof(u32)) + /* TCP_NLA_MIN_RTT */
    3800           0 :                 nla_total_size(sizeof(u8)) + /* TCP_NLA_RECUR_RETRANS */
    3801           0 :                 nla_total_size(sizeof(u8)) + /* TCP_NLA_DELIVERY_RATE_APP_LMT */
    3802           0 :                 nla_total_size(sizeof(u32)) + /* TCP_NLA_SNDQ_SIZE */
    3803           0 :                 nla_total_size(sizeof(u8)) + /* TCP_NLA_CA_STATE */
    3804           0 :                 nla_total_size(sizeof(u32)) + /* TCP_NLA_SND_SSTHRESH */
    3805           0 :                 nla_total_size(sizeof(u32)) + /* TCP_NLA_DELIVERED */
    3806           0 :                 nla_total_size(sizeof(u32)) + /* TCP_NLA_DELIVERED_CE */
    3807           0 :                 nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BYTES_SENT */
    3808           0 :                 nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BYTES_RETRANS */
    3809           0 :                 nla_total_size(sizeof(u32)) + /* TCP_NLA_DSACK_DUPS */
    3810           0 :                 nla_total_size(sizeof(u32)) + /* TCP_NLA_REORD_SEEN */
    3811           0 :                 nla_total_size(sizeof(u32)) + /* TCP_NLA_SRTT */
    3812           0 :                 nla_total_size(sizeof(u16)) + /* TCP_NLA_TIMEOUT_REHASH */
    3813           0 :                 nla_total_size(sizeof(u32)) + /* TCP_NLA_BYTES_NOTSENT */
    3814           0 :                 nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_EDT */
    3815           0 :                 nla_total_size(sizeof(u8)) + /* TCP_NLA_TTL */
    3816             :                 0;
    3817             : }
    3818             : 
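/* Worked example (annotation, assuming the standard netlink sizing macros
 * from <net/netlink.h>: NLA_HDRLEN == 4, attributes padded to 4 bytes) of
 * why this precomputed size is safe:
 *
 *     nla_total_size(sizeof(u32))       = NLA_ALIGN(4 + 4) =  8 bytes
 *     nla_total_size(sizeof(u16))       = NLA_ALIGN(4 + 2) =  8 bytes
 *     nla_total_size(sizeof(u8))        = NLA_ALIGN(4 + 1) =  8 bytes
 *     nla_total_size_64bit(sizeof(u64)) = NLA_ALIGN(4 + 8) = 12 bytes,
 *         plus one extra 4-byte pad attribute (TCP_NLA_PAD) on
 *         architectures without efficient unaligned access.
 *
 * Summing the worst case for every attribute up front is what lets
 * tcp_get_timestamping_opt_stats() below call nla_put_*() without checking
 * return values: the freshly allocated skb cannot run out of tailroom.
 */
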
    3819             : /* Returns TTL or hop limit of an incoming packet from skb. */
    3820           0 : static u8 tcp_skb_ttl_or_hop_limit(const struct sk_buff *skb)
    3821             : {
    3822           0 :         if (skb->protocol == htons(ETH_P_IP))
    3823           0 :                 return ip_hdr(skb)->ttl;
    3824           0 :         else if (skb->protocol == htons(ETH_P_IPV6))
    3825           0 :                 return ipv6_hdr(skb)->hop_limit;
    3826             :         else
    3827             :                 return 0;
    3828             : }
    3829             : 
    3830           0 : struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk,
    3831             :                                                const struct sk_buff *orig_skb,
    3832             :                                                const struct sk_buff *ack_skb)
    3833             : {
    3834           0 :         const struct tcp_sock *tp = tcp_sk(sk);
    3835           0 :         struct sk_buff *stats;
    3836           0 :         struct tcp_info info;
    3837           0 :         unsigned long rate;
    3838           0 :         u64 rate64;
    3839             : 
    3840           0 :         stats = alloc_skb(tcp_opt_stats_get_size(), GFP_ATOMIC);
    3841           0 :         if (!stats)
    3842             :                 return NULL;
    3843             : 
    3844           0 :         tcp_get_info_chrono_stats(tp, &info);
    3845           0 :         nla_put_u64_64bit(stats, TCP_NLA_BUSY,
    3846             :                           info.tcpi_busy_time, TCP_NLA_PAD);
    3847           0 :         nla_put_u64_64bit(stats, TCP_NLA_RWND_LIMITED,
    3848             :                           info.tcpi_rwnd_limited, TCP_NLA_PAD);
    3849           0 :         nla_put_u64_64bit(stats, TCP_NLA_SNDBUF_LIMITED,
    3850             :                           info.tcpi_sndbuf_limited, TCP_NLA_PAD);
    3851           0 :         nla_put_u64_64bit(stats, TCP_NLA_DATA_SEGS_OUT,
    3852           0 :                           tp->data_segs_out, TCP_NLA_PAD);
    3853           0 :         nla_put_u64_64bit(stats, TCP_NLA_TOTAL_RETRANS,
    3854           0 :                           tp->total_retrans, TCP_NLA_PAD);
    3855             : 
    3856           0 :         rate = READ_ONCE(sk->sk_pacing_rate);
    3857           0 :         rate64 = (rate != ~0UL) ? rate : ~0ULL;
    3858           0 :         nla_put_u64_64bit(stats, TCP_NLA_PACING_RATE, rate64, TCP_NLA_PAD);
    3859             : 
    3860           0 :         rate64 = tcp_compute_delivery_rate(tp);
    3861           0 :         nla_put_u64_64bit(stats, TCP_NLA_DELIVERY_RATE, rate64, TCP_NLA_PAD);
    3862             : 
    3863           0 :         nla_put_u32(stats, TCP_NLA_SND_CWND, tp->snd_cwnd);
    3864           0 :         nla_put_u32(stats, TCP_NLA_REORDERING, tp->reordering);
    3865           0 :         nla_put_u32(stats, TCP_NLA_MIN_RTT, tcp_min_rtt(tp));
    3866             : 
    3867           0 :         nla_put_u8(stats, TCP_NLA_RECUR_RETRANS, inet_csk(sk)->icsk_retransmits);
    3868           0 :         nla_put_u8(stats, TCP_NLA_DELIVERY_RATE_APP_LMT, !!tp->rate_app_limited);
    3869           0 :         nla_put_u32(stats, TCP_NLA_SND_SSTHRESH, tp->snd_ssthresh);
    3870           0 :         nla_put_u32(stats, TCP_NLA_DELIVERED, tp->delivered);
    3871           0 :         nla_put_u32(stats, TCP_NLA_DELIVERED_CE, tp->delivered_ce);
    3872             : 
    3873           0 :         nla_put_u32(stats, TCP_NLA_SNDQ_SIZE, tp->write_seq - tp->snd_una);
    3874           0 :         nla_put_u8(stats, TCP_NLA_CA_STATE, inet_csk(sk)->icsk_ca_state);
    3875             : 
    3876           0 :         nla_put_u64_64bit(stats, TCP_NLA_BYTES_SENT, tp->bytes_sent,
    3877             :                           TCP_NLA_PAD);
    3878           0 :         nla_put_u64_64bit(stats, TCP_NLA_BYTES_RETRANS, tp->bytes_retrans,
    3879             :                           TCP_NLA_PAD);
    3880           0 :         nla_put_u32(stats, TCP_NLA_DSACK_DUPS, tp->dsack_dups);
    3881           0 :         nla_put_u32(stats, TCP_NLA_REORD_SEEN, tp->reord_seen);
    3882           0 :         nla_put_u32(stats, TCP_NLA_SRTT, tp->srtt_us >> 3);
    3883           0 :         nla_put_u16(stats, TCP_NLA_TIMEOUT_REHASH, tp->timeout_rehash);
    3884           0 :         nla_put_u32(stats, TCP_NLA_BYTES_NOTSENT,
    3885           0 :                     max_t(int, 0, tp->write_seq - tp->snd_nxt));
    3886           0 :         nla_put_u64_64bit(stats, TCP_NLA_EDT, orig_skb->skb_mstamp_ns,
    3887             :                           TCP_NLA_PAD);
    3888           0 :         if (ack_skb)
    3889           0 :                 nla_put_u8(stats, TCP_NLA_TTL,
    3890           0 :                            tcp_skb_ttl_or_hop_limit(ack_skb));
    3891             : 
    3892             :         return stats;
    3893             : }
    3894             : 
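/* A hedged userspace sketch (not part of this file) of the consumer of the
 * attribute stream built above. With SOF_TIMESTAMPING_OPT_STATS enabled,
 * each TX timestamp delivered on the socket error queue carries an
 * SCM_TIMESTAMPING_OPT_STATS cmsg whose payload is the sequence of
 * TCP_NLA_* attributes emitted by tcp_get_timestamping_opt_stats(). Only
 * TCP_NLA_SND_CWND is decoded here; error handling is elided.
 */
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <sys/socket.h>
#include <linux/netlink.h>      /* struct nlattr, NLA_HDRLEN, NLA_ALIGN */
#include <linux/net_tstamp.h>   /* SOF_TIMESTAMPING_* */
#include <linux/tcp.h>          /* TCP_NLA_* */

#ifndef SCM_TIMESTAMPING_OPT_STATS
#define SCM_TIMESTAMPING_OPT_STATS 54   /* from <asm-generic/socket.h> */
#endif

static void enable_opt_stats(int fd)
{
        unsigned int flags = SOF_TIMESTAMPING_SOFTWARE |
                             SOF_TIMESTAMPING_TX_ACK |
                             SOF_TIMESTAMPING_OPT_STATS;

        setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, &flags, sizeof(flags));
}

static void read_opt_stats(int fd)
{
        char control[1024];
        struct msghdr msg = { .msg_control = control,
                              .msg_controllen = sizeof(control) };
        struct cmsghdr *cm;

        if (recvmsg(fd, &msg, MSG_ERRQUEUE) < 0)
                return;
        for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm)) {
                struct nlattr *nla;
                int len;

                if (cm->cmsg_level != SOL_SOCKET ||
                    cm->cmsg_type != SCM_TIMESTAMPING_OPT_STATS)
                        continue;
                nla = (struct nlattr *)CMSG_DATA(cm);
                len = cm->cmsg_len - CMSG_LEN(0);
                while (len >= (int)NLA_HDRLEN &&
                       nla->nla_len >= NLA_HDRLEN && nla->nla_len <= len) {
                        if (nla->nla_type == TCP_NLA_SND_CWND)
                                printf("snd_cwnd=%u\n",
                                       *(uint32_t *)((char *)nla + NLA_HDRLEN));
                        len -= NLA_ALIGN(nla->nla_len);
                        nla = (struct nlattr *)((char *)nla +
                                                NLA_ALIGN(nla->nla_len));
                }
        }
}
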
    3895           4 : static int do_tcp_getsockopt(struct sock *sk, int level,
    3896             :                 int optname, char __user *optval, int __user *optlen)
    3897             : {
    3898           4 :         struct inet_connection_sock *icsk = inet_csk(sk);
    3899           4 :         struct tcp_sock *tp = tcp_sk(sk);
    3900           4 :         struct net *net = sock_net(sk);
    3901           4 :         int val, len;
    3902             : 
    3903           4 :         if (get_user(len, optlen))
    3904             :                 return -EFAULT;
    3905             : 
    3906           4 :         len = min_t(unsigned int, len, sizeof(int));
    3907             : 
    3908           4 :         if (len < 0)
    3909             :                 return -EINVAL;
    3910             : 
    3911           4 :         switch (optname) {
    3912           0 :         case TCP_MAXSEG:
    3913           0 :                 val = tp->mss_cache;
    3914           0 :                 if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
    3915           0 :                         val = tp->rx_opt.user_mss;
    3916           0 :                 if (tp->repair)
    3917           0 :                         val = tp->rx_opt.mss_clamp;
    3918             :                 break;
    3919           4 :         case TCP_NODELAY:
    3920           4 :                 val = !!(tp->nonagle&TCP_NAGLE_OFF);
    3921           4 :                 break;
    3922           0 :         case TCP_CORK:
    3923           0 :                 val = !!(tp->nonagle&TCP_NAGLE_CORK);
    3924           0 :                 break;
    3925             :         case TCP_KEEPIDLE:
    3926           0 :                 val = keepalive_time_when(tp) / HZ;
    3927           0 :                 break;
    3928             :         case TCP_KEEPINTVL:
    3929           0 :                 val = keepalive_intvl_when(tp) / HZ;
    3930           0 :                 break;
    3931             :         case TCP_KEEPCNT:
    3932           0 :                 val = keepalive_probes(tp);
    3933           0 :                 break;
    3934           0 :         case TCP_SYNCNT:
    3935           0 :                 val = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_syn_retries;
    3936           0 :                 break;
    3937           0 :         case TCP_LINGER2:
    3938           0 :                 val = tp->linger2;
    3939           0 :                 if (val >= 0)
    3940           0 :                         val = (val ? : net->ipv4.sysctl_tcp_fin_timeout) / HZ;
    3941             :                 break;
    3942           0 :         case TCP_DEFER_ACCEPT:
    3943           0 :                 val = retrans_to_secs(icsk->icsk_accept_queue.rskq_defer_accept,
    3944             :                                       TCP_TIMEOUT_INIT / HZ, TCP_RTO_MAX / HZ);
    3945           0 :                 break;
    3946           0 :         case TCP_WINDOW_CLAMP:
    3947           0 :                 val = tp->window_clamp;
    3948           0 :                 break;
    3949           0 :         case TCP_INFO: {
    3950           0 :                 struct tcp_info info;
    3951             : 
    3952           0 :                 if (get_user(len, optlen))
    3953             :                         return -EFAULT;
    3954             : 
    3955           0 :                 tcp_get_info(sk, &info);
    3956             : 
    3957           0 :                 len = min_t(unsigned int, len, sizeof(info));
    3958           0 :                 if (put_user(len, optlen))
    3959             :                         return -EFAULT;
    3960           0 :                 if (copy_to_user(optval, &info, len))
    3961           0 :                         return -EFAULT;
    3962             :                 return 0;
    3963             :         }
    3964           0 :         case TCP_CC_INFO: {
    3965           0 :                 const struct tcp_congestion_ops *ca_ops;
    3966           0 :                 union tcp_cc_info info;
    3967           0 :                 size_t sz = 0;
    3968           0 :                 int attr;
    3969             : 
    3970           0 :                 if (get_user(len, optlen))
    3971             :                         return -EFAULT;
    3972             : 
    3973           0 :                 ca_ops = icsk->icsk_ca_ops;
    3974           0 :                 if (ca_ops && ca_ops->get_info)
    3975           0 :                         sz = ca_ops->get_info(sk, ~0U, &attr, &info);
    3976             : 
    3977           0 :                 len = min_t(unsigned int, len, sz);
    3978           0 :                 if (put_user(len, optlen))
    3979             :                         return -EFAULT;
    3980           0 :                 if (copy_to_user(optval, &info, len))
    3981           0 :                         return -EFAULT;
    3982             :                 return 0;
    3983             :         }
    3984             :         case TCP_QUICKACK:
    3985           0 :                 val = !inet_csk_in_pingpong_mode(sk);
    3986           0 :                 break;
    3987             : 
    3988           0 :         case TCP_CONGESTION:
    3989           0 :                 if (get_user(len, optlen))
    3990             :                         return -EFAULT;
    3991           0 :                 len = min_t(unsigned int, len, TCP_CA_NAME_MAX);
    3992           0 :                 if (put_user(len, optlen))
    3993             :                         return -EFAULT;
    3994           0 :                 if (copy_to_user(optval, icsk->icsk_ca_ops->name, len))
    3995           0 :                         return -EFAULT;
    3996             :                 return 0;
    3997             : 
    3998           0 :         case TCP_ULP:
    3999           0 :                 if (get_user(len, optlen))
    4000             :                         return -EFAULT;
    4001           0 :                 len = min_t(unsigned int, len, TCP_ULP_NAME_MAX);
    4002           0 :                 if (!icsk->icsk_ulp_ops) {
    4003           0 :                         if (put_user(0, optlen))
    4004             :                                 return -EFAULT;
    4005           0 :                         return 0;
    4006             :                 }
    4007           0 :                 if (put_user(len, optlen))
    4008             :                         return -EFAULT;
    4009           0 :                 if (copy_to_user(optval, icsk->icsk_ulp_ops->name, len))
    4010           0 :                         return -EFAULT;
    4011             :                 return 0;
    4012             : 
    4013           0 :         case TCP_FASTOPEN_KEY: {
    4014           0 :                 u64 key[TCP_FASTOPEN_KEY_BUF_LENGTH / sizeof(u64)];
    4015           0 :                 unsigned int key_len;
    4016             : 
    4017           0 :                 if (get_user(len, optlen))
    4018             :                         return -EFAULT;
    4019             : 
    4020           0 :                 key_len = tcp_fastopen_get_cipher(net, icsk, key) *
    4021             :                                 TCP_FASTOPEN_KEY_LENGTH;
    4022           0 :                 len = min_t(unsigned int, len, key_len);
    4023           0 :                 if (put_user(len, optlen))
    4024             :                         return -EFAULT;
    4025           0 :                 if (copy_to_user(optval, key, len))
    4026           0 :                         return -EFAULT;
    4027             :                 return 0;
    4028             :         }
    4029           0 :         case TCP_THIN_LINEAR_TIMEOUTS:
    4030           0 :                 val = tp->thin_lto;
    4031           0 :                 break;
    4032             : 
    4033           0 :         case TCP_THIN_DUPACK:
    4034           0 :                 val = 0;
    4035           0 :                 break;
    4036             : 
    4037           0 :         case TCP_REPAIR:
    4038           0 :                 val = tp->repair;
    4039           0 :                 break;
    4040             : 
    4041           0 :         case TCP_REPAIR_QUEUE:
    4042           0 :                 if (tp->repair)
    4043           0 :                         val = tp->repair_queue;
    4044             :                 else
    4045             :                         return -EINVAL;
    4046           0 :                 break;
    4047             : 
    4048           0 :         case TCP_REPAIR_WINDOW: {
    4049           0 :                 struct tcp_repair_window opt;
    4050             : 
    4051           0 :                 if (get_user(len, optlen))
    4052             :                         return -EFAULT;
    4053             : 
    4054           0 :                 if (len != sizeof(opt))
    4055             :                         return -EINVAL;
    4056             : 
    4057           0 :                 if (!tp->repair)
    4058             :                         return -EPERM;
    4059             : 
    4060           0 :                 opt.snd_wl1     = tp->snd_wl1;
    4061           0 :                 opt.snd_wnd     = tp->snd_wnd;
    4062           0 :                 opt.max_window  = tp->max_window;
    4063           0 :                 opt.rcv_wnd     = tp->rcv_wnd;
    4064           0 :                 opt.rcv_wup     = tp->rcv_wup;
    4065             : 
    4066           0 :                 if (copy_to_user(optval, &opt, len))
    4067           0 :                         return -EFAULT;
    4068             :                 return 0;
    4069             :         }
    4070           0 :         case TCP_QUEUE_SEQ:
    4071           0 :                 if (tp->repair_queue == TCP_SEND_QUEUE)
    4072           0 :                         val = tp->write_seq;
    4073           0 :                 else if (tp->repair_queue == TCP_RECV_QUEUE)
    4074           0 :                         val = tp->rcv_nxt;
    4075             :                 else
    4076             :                         return -EINVAL;
    4077             :                 break;
    4078             : 
    4079           0 :         case TCP_USER_TIMEOUT:
    4080           0 :                 val = icsk->icsk_user_timeout;
    4081           0 :                 break;
    4082             : 
    4083           0 :         case TCP_FASTOPEN:
    4084           0 :                 val = icsk->icsk_accept_queue.fastopenq.max_qlen;
    4085           0 :                 break;
    4086             : 
    4087           0 :         case TCP_FASTOPEN_CONNECT:
    4088           0 :                 val = tp->fastopen_connect;
    4089           0 :                 break;
    4090             : 
    4091           0 :         case TCP_FASTOPEN_NO_COOKIE:
    4092           0 :                 val = tp->fastopen_no_cookie;
    4093           0 :                 break;
    4094             : 
    4095           0 :         case TCP_TX_DELAY:
    4096           0 :                 val = tp->tcp_tx_delay;
    4097           0 :                 break;
    4098             : 
    4099             :         case TCP_TIMESTAMP:
    4100           0 :                 val = tcp_time_stamp_raw() + tp->tsoffset;
    4101           0 :                 break;
    4102           0 :         case TCP_NOTSENT_LOWAT:
    4103           0 :                 val = tp->notsent_lowat;
    4104           0 :                 break;
    4105           0 :         case TCP_INQ:
    4106           0 :                 val = tp->recvmsg_inq;
    4107           0 :                 break;
    4108           0 :         case TCP_SAVE_SYN:
    4109           0 :                 val = tp->save_syn;
    4110           0 :                 break;
    4111           0 :         case TCP_SAVED_SYN: {
    4112           0 :                 if (get_user(len, optlen))
    4113             :                         return -EFAULT;
    4114             : 
    4115           0 :                 lock_sock(sk);
    4116           0 :                 if (tp->saved_syn) {
    4117           0 :                         if (len < tcp_saved_syn_len(tp->saved_syn)) {
    4118           0 :                                 if (put_user(tcp_saved_syn_len(tp->saved_syn),
    4119             :                                              optlen)) {
    4120           0 :                                         release_sock(sk);
    4121           0 :                                         return -EFAULT;
    4122             :                                 }
    4123           0 :                                 release_sock(sk);
    4124           0 :                                 return -EINVAL;
    4125             :                         }
    4126           0 :                         len = tcp_saved_syn_len(tp->saved_syn);
    4127           0 :                         if (put_user(len, optlen)) {
    4128           0 :                                 release_sock(sk);
    4129           0 :                                 return -EFAULT;
    4130             :                         }
    4131           0 :                         if (copy_to_user(optval, tp->saved_syn->data, len)) {
    4132           0 :                                 release_sock(sk);
    4133           0 :                                 return -EFAULT;
    4134             :                         }
    4135           0 :                         tcp_saved_syn_free(tp);
    4136           0 :                         release_sock(sk);
    4137             :                 } else {
    4138           0 :                         release_sock(sk);
    4139           0 :                         len = 0;
    4140           0 :                         if (put_user(len, optlen))
    4141           0 :                                 return -EFAULT;
    4142             :                 }
    4143             :                 return 0;
    4144             :         }
    4145             : #ifdef CONFIG_MMU
    4146           0 :         case TCP_ZEROCOPY_RECEIVE: {
    4147           0 :                 struct scm_timestamping_internal tss;
    4148           0 :                 struct tcp_zerocopy_receive zc = {};
    4149           0 :                 int err;
    4150             : 
    4151           0 :                 if (get_user(len, optlen))
    4152             :                         return -EFAULT;
    4153           0 :                 if (len < 0 ||
    4154             :                     len < offsetofend(struct tcp_zerocopy_receive, length))
    4155             :                         return -EINVAL;
    4156           0 :                 if (unlikely(len > sizeof(zc))) {
    4157           0 :                         err = check_zeroed_user(optval + sizeof(zc),
    4158             :                                                 len - sizeof(zc));
    4159           0 :                         if (err < 1)
    4160           0 :                                 return err == 0 ? -EINVAL : err;
    4161           0 :                         len = sizeof(zc);
    4162           0 :                         if (put_user(len, optlen))
    4163             :                                 return -EFAULT;
    4164             :                 }
    4165           0 :                 if (copy_from_user(&zc, optval, len))
    4166             :                         return -EFAULT;
    4167           0 :                 if (zc.reserved)
    4168             :                         return -EINVAL;
    4169           0 :                 if (zc.msg_flags &  ~(TCP_VALID_ZC_MSG_FLAGS))
    4170             :                         return -EINVAL;
    4171           0 :                 lock_sock(sk);
    4172           0 :                 err = tcp_zerocopy_receive(sk, &zc, &tss);
    4173           0 :                 err = BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sk, level, optname,
    4174             :                                                           &zc, &len, err);
    4175           0 :                 release_sock(sk);
    4176           0 :                 if (len >= offsetofend(struct tcp_zerocopy_receive, msg_flags))
    4177           0 :                         goto zerocopy_rcv_cmsg;
    4178           0 :                 switch (len) {
    4179             :                 case offsetofend(struct tcp_zerocopy_receive, msg_flags):
    4180             :                         goto zerocopy_rcv_cmsg;
    4181           0 :                 case offsetofend(struct tcp_zerocopy_receive, msg_controllen):
    4182             :                 case offsetofend(struct tcp_zerocopy_receive, msg_control):
    4183             :                 case offsetofend(struct tcp_zerocopy_receive, flags):
    4184             :                 case offsetofend(struct tcp_zerocopy_receive, copybuf_len):
    4185             :                 case offsetofend(struct tcp_zerocopy_receive, copybuf_address):
    4186             :                 case offsetofend(struct tcp_zerocopy_receive, err):
    4187           0 :                         goto zerocopy_rcv_sk_err;
    4188           0 :                 case offsetofend(struct tcp_zerocopy_receive, inq):
    4189           0 :                         goto zerocopy_rcv_inq;
    4190           0 :                 case offsetofend(struct tcp_zerocopy_receive, length):
    4191             :                 default:
    4192           0 :                         goto zerocopy_rcv_out;
    4193             :                 }
    4194           0 : zerocopy_rcv_cmsg:
    4195           0 :                 if (zc.msg_flags & TCP_CMSG_TS)
    4196           0 :                         tcp_zc_finalize_rx_tstamp(sk, &zc, &tss);
    4197             :                 else
    4198           0 :                         zc.msg_flags = 0;
    4199           0 : zerocopy_rcv_sk_err:
    4200           0 :                 if (!err)
    4201           0 :                         zc.err = sock_error(sk);
    4202           0 : zerocopy_rcv_inq:
    4203           0 :                 zc.inq = tcp_inq_hint(sk);
    4204           0 : zerocopy_rcv_out:
    4205           0 :                 if (!err && copy_to_user(optval, &zc, len))
    4206           0 :                         err = -EFAULT;
    4207             :                 return err;
    4208             :         }
    4209             : #endif
    4210             :         default:
    4211             :                 return -ENOPROTOOPT;
    4212             :         }
    4213             : 
    4214           4 :         if (put_user(len, optlen))
    4215             :                 return -EFAULT;
    4216           8 :         if (copy_to_user(optval, &val, len))
    4217           0 :                 return -EFAULT;
    4218             :         return 0;
    4219             : }
    4220             : 
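/* The TCP_SAVED_SYN case above implements a two-step "query then fetch"
 * protocol: if the supplied buffer is too small, the kernel writes the
 * required length to *optlen and fails with EINVAL, and a successful read
 * frees the saved SYN, so it can only be fetched once. A hedged userspace
 * sketch (not part of this file; TCP_SAVE_SYN must have been enabled on
 * the listening socket before the connection was accepted):
 */
#include <errno.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

static int fetch_saved_syn(int fd, char *buf, socklen_t bufsz)
{
        socklen_t len = bufsz;

        if (getsockopt(fd, IPPROTO_TCP, TCP_SAVED_SYN, buf, &len) == 0)
                return (int)len;        /* headers of the SYN are now in buf */
        if (errno == EINVAL && len > bufsz)
                return -(int)len;       /* caller should retry with this size */
        return -1;
}
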
    4221           0 : bool tcp_bpf_bypass_getsockopt(int level, int optname)
    4222             : {
     4223             :         /* TCP's do_tcp_getsockopt() has an optimized getsockopt implementation
     4224             :          * that avoids taking an extra socket lock for TCP_ZEROCOPY_RECEIVE.
    4225             :          */
    4226           0 :         if (level == SOL_TCP && optname == TCP_ZEROCOPY_RECEIVE)
    4227           0 :                 return true;
    4228             : 
    4229             :         return false;
    4230             : }
    4231             : EXPORT_SYMBOL(tcp_bpf_bypass_getsockopt);
    4232             : 
    4233           8 : int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
    4234             :                    int __user *optlen)
    4235             : {
    4236           8 :         struct inet_connection_sock *icsk = inet_csk(sk);
    4237             : 
    4238           8 :         if (level != SOL_TCP)
    4239           4 :                 return icsk->icsk_af_ops->getsockopt(sk, level, optname,
    4240             :                                                      optval, optlen);
    4241           4 :         return do_tcp_getsockopt(sk, level, optname, optval, optlen);
    4242             : }
    4243             : EXPORT_SYMBOL(tcp_getsockopt);
    4244             : 
    4245             : #ifdef CONFIG_TCP_MD5SIG
    4246             : static DEFINE_PER_CPU(struct tcp_md5sig_pool, tcp_md5sig_pool);
    4247             : static DEFINE_MUTEX(tcp_md5sig_mutex);
    4248             : static bool tcp_md5sig_pool_populated = false;
    4249             : 
    4250             : static void __tcp_alloc_md5sig_pool(void)
    4251             : {
    4252             :         struct crypto_ahash *hash;
    4253             :         int cpu;
    4254             : 
    4255             :         hash = crypto_alloc_ahash("md5", 0, CRYPTO_ALG_ASYNC);
    4256             :         if (IS_ERR(hash))
    4257             :                 return;
    4258             : 
    4259             :         for_each_possible_cpu(cpu) {
    4260             :                 void *scratch = per_cpu(tcp_md5sig_pool, cpu).scratch;
    4261             :                 struct ahash_request *req;
    4262             : 
    4263             :                 if (!scratch) {
    4264             :                         scratch = kmalloc_node(sizeof(union tcp_md5sum_block) +
    4265             :                                                sizeof(struct tcphdr),
    4266             :                                                GFP_KERNEL,
    4267             :                                                cpu_to_node(cpu));
    4268             :                         if (!scratch)
    4269             :                                 return;
    4270             :                         per_cpu(tcp_md5sig_pool, cpu).scratch = scratch;
    4271             :                 }
    4272             :                 if (per_cpu(tcp_md5sig_pool, cpu).md5_req)
    4273             :                         continue;
    4274             : 
    4275             :                 req = ahash_request_alloc(hash, GFP_KERNEL);
    4276             :                 if (!req)
    4277             :                         return;
    4278             : 
    4279             :                 ahash_request_set_callback(req, 0, NULL, NULL);
    4280             : 
    4281             :                 per_cpu(tcp_md5sig_pool, cpu).md5_req = req;
    4282             :         }
     4283             :         /* Before setting tcp_md5sig_pool_populated, we must commit all writes
     4284             :          * to memory. See smp_rmb() in tcp_get_md5sig_pool().
    4285             :          */
    4286             :         smp_wmb();
    4287             :         tcp_md5sig_pool_populated = true;
    4288             : }
    4289             : 
    4290             : bool tcp_alloc_md5sig_pool(void)
    4291             : {
    4292             :         if (unlikely(!tcp_md5sig_pool_populated)) {
    4293             :                 mutex_lock(&tcp_md5sig_mutex);
    4294             : 
    4295             :                 if (!tcp_md5sig_pool_populated) {
    4296             :                         __tcp_alloc_md5sig_pool();
    4297             :                         if (tcp_md5sig_pool_populated)
    4298             :                                 static_branch_inc(&tcp_md5_needed);
    4299             :                 }
    4300             : 
    4301             :                 mutex_unlock(&tcp_md5sig_mutex);
    4302             :         }
    4303             :         return tcp_md5sig_pool_populated;
    4304             : }
    4305             : EXPORT_SYMBOL(tcp_alloc_md5sig_pool);
    4306             : 
    4307             : 
    4308             : /**
    4309             :  *      tcp_get_md5sig_pool - get md5sig_pool for this user
    4310             :  *
     4311             :  *      We use a percpu structure, so on success we return with preemption
     4312             :  *      and BH disabled, to make sure no other thread or softirq handler
     4313             :  *      will try to use the same context.
    4314             :  */
    4315             : struct tcp_md5sig_pool *tcp_get_md5sig_pool(void)
    4316             : {
    4317             :         local_bh_disable();
    4318             : 
    4319             :         if (tcp_md5sig_pool_populated) {
    4320             :                 /* coupled with smp_wmb() in __tcp_alloc_md5sig_pool() */
    4321             :                 smp_rmb();
    4322             :                 return this_cpu_ptr(&tcp_md5sig_pool);
    4323             :         }
    4324             :         local_bh_enable();
    4325             :         return NULL;
    4326             : }
    4327             : EXPORT_SYMBOL(tcp_get_md5sig_pool);
    4328             : 
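/* The populated-flag handshake above is the classic one-way publish
 * pattern (double-checked under tcp_md5sig_mutex on the slow path). A
 * generic sketch of the ordering contract — names here are illustrative,
 * not kernel API:
 *
 *   writer (once):                      reader (hot path):
 *       init_all_the_data();                if (ready) {
 *       smp_wmb();   // commit writes           smp_rmb(); // order reads
 *       ready = true;                           use_the_data();
 *                                           }
 *
 * A reader that observes ready == true is guaranteed to also observe
 * every write the writer made before its smp_wmb().
 */
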
    4329             : int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp,
    4330             :                           const struct sk_buff *skb, unsigned int header_len)
    4331             : {
    4332             :         struct scatterlist sg;
    4333             :         const struct tcphdr *tp = tcp_hdr(skb);
    4334             :         struct ahash_request *req = hp->md5_req;
    4335             :         unsigned int i;
    4336             :         const unsigned int head_data_len = skb_headlen(skb) > header_len ?
    4337             :                                            skb_headlen(skb) - header_len : 0;
    4338             :         const struct skb_shared_info *shi = skb_shinfo(skb);
    4339             :         struct sk_buff *frag_iter;
    4340             : 
    4341             :         sg_init_table(&sg, 1);
    4342             : 
    4343             :         sg_set_buf(&sg, ((u8 *) tp) + header_len, head_data_len);
    4344             :         ahash_request_set_crypt(req, &sg, NULL, head_data_len);
    4345             :         if (crypto_ahash_update(req))
    4346             :                 return 1;
    4347             : 
    4348             :         for (i = 0; i < shi->nr_frags; ++i) {
    4349             :                 const skb_frag_t *f = &shi->frags[i];
    4350             :                 unsigned int offset = skb_frag_off(f);
    4351             :                 struct page *page = skb_frag_page(f) + (offset >> PAGE_SHIFT);
    4352             : 
    4353             :                 sg_set_page(&sg, page, skb_frag_size(f),
    4354             :                             offset_in_page(offset));
    4355             :                 ahash_request_set_crypt(req, &sg, NULL, skb_frag_size(f));
    4356             :                 if (crypto_ahash_update(req))
    4357             :                         return 1;
    4358             :         }
    4359             : 
    4360             :         skb_walk_frags(skb, frag_iter)
    4361             :                 if (tcp_md5_hash_skb_data(hp, frag_iter, 0))
    4362             :                         return 1;
    4363             : 
    4364             :         return 0;
    4365             : }
    4366             : EXPORT_SYMBOL(tcp_md5_hash_skb_data);
    4367             : 
    4368             : int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, const struct tcp_md5sig_key *key)
    4369             : {
    4370             :         u8 keylen = READ_ONCE(key->keylen); /* paired with WRITE_ONCE() in tcp_md5_do_add */
    4371             :         struct scatterlist sg;
    4372             : 
    4373             :         sg_init_one(&sg, key->key, keylen);
    4374             :         ahash_request_set_crypt(hp->md5_req, &sg, NULL, keylen);
    4375             : 
    4376             :         /* We use data_race() because tcp_md5_do_add() might change key->key under us */
    4377             :         return data_race(crypto_ahash_update(hp->md5_req));
    4378             : }
    4379             : EXPORT_SYMBOL(tcp_md5_hash_key);
    4380             : 
    4381             : #endif
    4382             : 
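/* The md5sig pool above backs the RFC 2385 TCP-MD5 signature option
 * (used mainly to protect BGP sessions). A hedged sketch (not part of
 * this file) of how userspace installs a key via the uapi struct from
 * <linux/tcp.h>; <netinet/tcp.h> is deliberately not included, since its
 * struct tcphdr clashes with the uapi header:
 */
#include <string.h>
#include <netinet/in.h>         /* IPPROTO_TCP, struct sockaddr_in */
#include <linux/tcp.h>          /* TCP_MD5SIG, struct tcp_md5sig */
#include <sys/socket.h>

static int set_tcp_md5_key(int fd, const struct sockaddr_in *peer,
                           const void *key, size_t keylen)
{
        struct tcp_md5sig md5;

        if (keylen > TCP_MD5SIG_MAXKEYLEN)
                return -1;
        memset(&md5, 0, sizeof(md5));
        memcpy(&md5.tcpm_addr, peer, sizeof(*peer));
        md5.tcpm_keylen = keylen;
        memcpy(md5.tcpm_key, key, keylen);
        return setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
}
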
    4383           3 : void tcp_done(struct sock *sk)
    4384             : {
    4385           3 :         struct request_sock *req;
    4386             : 
    4387             :         /* We might be called with a new socket, after
     4388             :          * inet_csk_prepare_forced_close() has been called,
     4389             :          * so we cannot use lockdep_sock_is_held(sk).
    4390             :          */
    4391           3 :         req = rcu_dereference_protected(tcp_sk(sk)->fastopen_rsk, 1);
    4392             : 
    4393           3 :         if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV)
    4394           3 :                 TCP_INC_STATS(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
    4395             : 
    4396           3 :         tcp_set_state(sk, TCP_CLOSE);
    4397           3 :         tcp_clear_xmit_timers(sk);
    4398           3 :         if (req)
    4399           0 :                 reqsk_fastopen_remove(sk, req, false);
    4400             : 
    4401           3 :         sk->sk_shutdown = SHUTDOWN_MASK;
    4402             : 
    4403           3 :         if (!sock_flag(sk, SOCK_DEAD))
    4404           0 :                 sk->sk_state_change(sk);
    4405             :         else
    4406           3 :                 inet_csk_destroy_sock(sk);
    4407           3 : }
    4408             : EXPORT_SYMBOL_GPL(tcp_done);
    4409             : 
    4410           0 : int tcp_abort(struct sock *sk, int err)
    4411             : {
    4412           0 :         if (!sk_fullsock(sk)) {
    4413           0 :                 if (sk->sk_state == TCP_NEW_SYN_RECV) {
    4414           0 :                         struct request_sock *req = inet_reqsk(sk);
    4415             : 
    4416           0 :                         local_bh_disable();
    4417           0 :                         inet_csk_reqsk_queue_drop(req->rsk_listener, req);
    4418           0 :                         local_bh_enable();
    4419           0 :                         return 0;
    4420             :                 }
    4421             :                 return -EOPNOTSUPP;
    4422             :         }
    4423             : 
    4424             :         /* Don't race with userspace socket closes such as tcp_close. */
    4425           0 :         lock_sock(sk);
    4426             : 
    4427           0 :         if (sk->sk_state == TCP_LISTEN) {
    4428           0 :                 tcp_set_state(sk, TCP_CLOSE);
    4429           0 :                 inet_csk_listen_stop(sk);
    4430             :         }
    4431             : 
    4432             :         /* Don't race with BH socket closes such as inet_csk_listen_stop. */
    4433           0 :         local_bh_disable();
    4434           0 :         bh_lock_sock(sk);
    4435             : 
    4436           0 :         if (!sock_flag(sk, SOCK_DEAD)) {
    4437           0 :                 sk->sk_err = err;
    4438             :                 /* This barrier is coupled with smp_rmb() in tcp_poll() */
    4439           0 :                 smp_wmb();
    4440           0 :                 sk->sk_error_report(sk);
    4441           0 :                 if (tcp_need_reset(sk->sk_state))
    4442           0 :                         tcp_send_active_reset(sk, GFP_ATOMIC);
    4443           0 :                 tcp_done(sk);
    4444             :         }
    4445             : 
    4446           0 :         bh_unlock_sock(sk);
    4447           0 :         local_bh_enable();
    4448           0 :         tcp_write_queue_purge(sk);
    4449           0 :         release_sock(sk);
    4450           0 :         return 0;
    4451             : }
    4452             : EXPORT_SYMBOL_GPL(tcp_abort);
    4453             : 
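/* Usage note (not code from this file): tcp_abort() is the backend of the
 * inet_diag SOCK_DESTROY command, which calls it with ECONNABORTED. That
 * is what iproute2's "ss -K" issues when the kernel is built with
 * CONFIG_INET_DIAG_DESTROY, e.g.:
 *
 *     # forcibly close established connections on local port 5001
 *     ss -K '( sport = :5001 )'
 */
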
    4454             : extern struct tcp_congestion_ops tcp_reno;
    4455             : 
    4456             : static __initdata unsigned long thash_entries;
    4457           0 : static int __init set_thash_entries(char *str)
    4458             : {
    4459           0 :         ssize_t ret;
    4460             : 
    4461           0 :         if (!str)
    4462             :                 return 0;
    4463             : 
    4464           0 :         ret = kstrtoul(str, 0, &thash_entries);
    4465           0 :         if (ret)
    4466           0 :                 return 0;
    4467             : 
    4468             :         return 1;
    4469             : }
    4470             : __setup("thash_entries=", set_thash_entries);
    4471             : 
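/* thash_entries is an early boot parameter, parsed above with kstrtoul()
 * (base 0, so decimal, octal, and 0x-prefixed hex all work) and consumed
 * by tcp_init() below to size the established hash table. For example, on
 * the kernel command line:
 *
 *     thash_entries=131072
 *
 * The default of 0 lets alloc_large_system_hash() scale the table to the
 * amount of memory present.
 */
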
    4472           1 : static void __init tcp_init_mem(void)
    4473             : {
    4474           1 :         unsigned long limit = nr_free_buffer_pages() / 16;
    4475             : 
    4476           1 :         limit = max(limit, 128UL);
    4477           1 :         sysctl_tcp_mem[0] = limit / 4 * 3;              /* 4.68 % */
    4478           1 :         sysctl_tcp_mem[1] = limit;                      /* 6.25 % */
    4479           1 :         sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2;      /* 9.37 % */
    4480           1 : }
    4481             : 
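/* Worked example of the sizing above, assuming nr_free_buffer_pages()
 * reports 1,048,576 pages (4 GiB of 4 KiB pages):
 *
 *     limit          = 1048576 / 16    = 65536 pages (256 MiB, 6.25 %)
 *     sysctl_tcp_mem = { 49152, 65536, 98304 } pages (192/256/384 MiB)
 *
 * i.e. TCP begins moderating its memory use at ~4.69 % of memory and
 * treats ~9.37 % as the hard limit, matching the percentage comments
 * above.
 */
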
    4482           1 : void __init tcp_init(void)
    4483             : {
    4484           1 :         int max_rshare, max_wshare, cnt;
    4485           1 :         unsigned long limit;
    4486           1 :         unsigned int i;
    4487             : 
    4488           1 :         BUILD_BUG_ON(TCP_MIN_SND_MSS <= MAX_TCP_OPTION_SPACE);
    4489           1 :         BUILD_BUG_ON(sizeof(struct tcp_skb_cb) >
    4490             :                      sizeof_field(struct sk_buff, cb));
    4491             : 
    4492           1 :         percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL);
    4493           1 :         percpu_counter_init(&tcp_orphan_count, 0, GFP_KERNEL);
    4494           1 :         inet_hashinfo_init(&tcp_hashinfo);
    4495           1 :         inet_hashinfo2_init(&tcp_hashinfo, "tcp_listen_portaddr_hash",
     4496             :                             thash_entries, 21,  /* one slot per 2 MB */
    4497             :                             0, 64 * 1024);
    4498           2 :         tcp_hashinfo.bind_bucket_cachep =
    4499           1 :                 kmem_cache_create("tcp_bind_bucket",
    4500             :                                   sizeof(struct inet_bind_bucket), 0,
    4501             :                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
    4502             : 
    4503             :         /* Size and allocate the main established and bind bucket
    4504             :          * hash tables.
    4505             :          *
    4506             :          * The methodology is similar to that of the buffer cache.
    4507             :          */
    4508           2 :         tcp_hashinfo.ehash =
    4509           1 :                 alloc_large_system_hash("TCP established",
    4510             :                                         sizeof(struct inet_ehash_bucket),
    4511             :                                         thash_entries,
    4512             :                                         17, /* one slot per 128 KB of memory */
    4513             :                                         0,
    4514             :                                         NULL,
    4515             :                                         &tcp_hashinfo.ehash_mask,
    4516             :                                         0,
    4517           1 :                                         thash_entries ? 0 : 512 * 1024);
    4518        8193 :         for (i = 0; i <= tcp_hashinfo.ehash_mask; i++)
    4519        8192 :                 INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].chain, i);
    4520             : 
    4521           1 :         if (inet_ehash_locks_alloc(&tcp_hashinfo))
    4522           0 :                 panic("TCP: failed to alloc ehash_locks");
    4523           2 :         tcp_hashinfo.bhash =
    4524           1 :                 alloc_large_system_hash("TCP bind",
    4525             :                                         sizeof(struct inet_bind_hashbucket),
    4526           1 :                                         tcp_hashinfo.ehash_mask + 1,
    4527             :                                         17, /* one slot per 128 KB of memory */
    4528             :                                         0,
    4529             :                                         &tcp_hashinfo.bhash_size,
    4530             :                                         NULL,
    4531             :                                         0,
    4532             :                                         64 * 1024);
    4533           1 :         tcp_hashinfo.bhash_size = 1U << tcp_hashinfo.bhash_size;
    4534        8193 :         for (i = 0; i < tcp_hashinfo.bhash_size; i++) {
    4535        8192 :                 spin_lock_init(&tcp_hashinfo.bhash[i].lock);
    4536        8192 :                 INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
    4537             :         }
    4538             : 
    4539             : 
    4540           1 :         cnt = tcp_hashinfo.ehash_mask + 1;
    4541           1 :         sysctl_tcp_max_orphans = cnt / 2;
    4542             : 
    4543           1 :         tcp_init_mem();
     4544             :         /* Set per-socket limits to no more than 1/128 of the pressure threshold */
    4545           1 :         limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7);
    4546           1 :         max_wshare = min(4UL*1024*1024, limit);
    4547           1 :         max_rshare = min(6UL*1024*1024, limit);
    4548             : 
    4549           1 :         init_net.ipv4.sysctl_tcp_wmem[0] = SK_MEM_QUANTUM;
    4550           1 :         init_net.ipv4.sysctl_tcp_wmem[1] = 16*1024;
    4551           1 :         init_net.ipv4.sysctl_tcp_wmem[2] = max(64*1024, max_wshare);
    4552             : 
    4553           1 :         init_net.ipv4.sysctl_tcp_rmem[0] = SK_MEM_QUANTUM;
    4554           1 :         init_net.ipv4.sysctl_tcp_rmem[1] = 131072;
    4555           1 :         init_net.ipv4.sysctl_tcp_rmem[2] = max(131072, max_rshare);
    4556             : 
    4557           1 :         pr_info("Hash tables configured (established %u bind %u)\n",
    4558             :                 tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size);
    4559             : 
    4560           1 :         tcp_v4_init();
    4561           1 :         tcp_metrics_init();
    4562           1 :         BUG_ON(tcp_register_congestion_control(&tcp_reno) != 0);
    4563           1 :         tcp_tasklet_init();
    4564           1 :         mptcp_init();
    4565           1 : }
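
/* Annotation: the hit counts recorded in this coverage run (8192
 * iterations of each hash-initialization loop above) imply
 * ehash_mask + 1 == 8192 and bhash_size == 8192, so the pr_info() above
 * would have produced a boot-log line reading approximately:
 *
 *     TCP: Hash tables configured (established 8192 bind 8192)
 *
 * The derived limits remain visible at runtime, e.g.:
 *
 *     $ sysctl net.ipv4.tcp_mem net.ipv4.tcp_rmem net.ipv4.tcp_wmem
 */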

Generated by: LCOV version 1.14