LCOV - code coverage report
Current view: top level - net/ipv4 - tcp_ipv4.c (source / functions)
Test: landlock.info                        Date: 2021-04-22 12:43:58
Lines: 388 / 1165 (33.3 %)                 Functions: 23 / 55 (41.8 %)

          Line data    Source code
       1             : // SPDX-License-Identifier: GPL-2.0-or-later
       2             : /*
       3             :  * INET         An implementation of the TCP/IP protocol suite for the LINUX
       4             :  *              operating system.  INET is implemented using the  BSD Socket
       5             :  *              interface as the means of communication with the user level.
       6             :  *
       7             :  *              Implementation of the Transmission Control Protocol(TCP).
       8             :  *
       9             :  *              IPv4 specific functions
      10             :  *
      11             :  *              code split from:
      12             :  *              linux/ipv4/tcp.c
      13             :  *              linux/ipv4/tcp_input.c
      14             :  *              linux/ipv4/tcp_output.c
      15             :  *
      16             :  *              See tcp.c for author information
      17             :  */
      18             : 
      19             : /*
      20             :  * Changes:
      21             :  *              David S. Miller :       New socket lookup architecture.
      22             :  *                                      This code is dedicated to John Dyson.
      23             :  *              David S. Miller :       Change semantics of established hash,
      24             :  *                                      half is devoted to TIME_WAIT sockets
      25             :  *                                      and the rest go in the other half.
      26             :  *              Andi Kleen :            Add support for syncookies and fixed
      27             :  *                                      some bugs: ip options weren't passed to
      28             :  *                                      the TCP layer, missed a check for an
      29             :  *                                      ACK bit.
      30             :  *              Andi Kleen :            Implemented fast path mtu discovery.
      31             :  *                                      Fixed many serious bugs in the
      32             :  *                                      request_sock handling and moved
      33             :  *                                      most of it into the af independent code.
      34             :  *                                      Added tail drop and some other bugfixes.
      35             :  *                                      Added new listen semantics.
      36             :  *              Mike McLagan    :       Routing by source
      37             :  *      Juan Jose Ciarlante:            ip_dynaddr bits
      38             :  *              Andi Kleen:             various fixes.
      39             :  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
      40             :  *                                      coma.
      41             :  *      Andi Kleen              :       Fix new listen.
      42             :  *      Andi Kleen              :       Fix accept error reporting.
      43             :  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
      44             :  *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
      45             :  *                                      a single port at the same time.
      46             :  */
      47             : 
      48             : #define pr_fmt(fmt) "TCP: " fmt
      49             : 
      50             : #include <linux/bottom_half.h>
      51             : #include <linux/types.h>
      52             : #include <linux/fcntl.h>
      53             : #include <linux/module.h>
      54             : #include <linux/random.h>
      55             : #include <linux/cache.h>
      56             : #include <linux/jhash.h>
      57             : #include <linux/init.h>
      58             : #include <linux/times.h>
      59             : #include <linux/slab.h>
      60             : 
      61             : #include <net/net_namespace.h>
      62             : #include <net/icmp.h>
      63             : #include <net/inet_hashtables.h>
      64             : #include <net/tcp.h>
      65             : #include <net/transp_v6.h>
      66             : #include <net/ipv6.h>
      67             : #include <net/inet_common.h>
      68             : #include <net/timewait_sock.h>
      69             : #include <net/xfrm.h>
      70             : #include <net/secure_seq.h>
      71             : #include <net/busy_poll.h>
      72             : 
      73             : #include <linux/inet.h>
      74             : #include <linux/ipv6.h>
      75             : #include <linux/stddef.h>
      76             : #include <linux/proc_fs.h>
      77             : #include <linux/seq_file.h>
      78             : #include <linux/inetdevice.h>
      79             : #include <linux/btf_ids.h>
      80             : 
      81             : #include <crypto/hash.h>
      82             : #include <linux/scatterlist.h>
      83             : 
      84             : #include <trace/events/tcp.h>
      85             : 
      86             : #ifdef CONFIG_TCP_MD5SIG
      87             : static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
      88             :                                __be32 daddr, __be32 saddr, const struct tcphdr *th);
      89             : #endif
      90             : 
      91             : struct inet_hashinfo tcp_hashinfo;
      92             : EXPORT_SYMBOL(tcp_hashinfo);
      93             : 
      94           4 : static u32 tcp_v4_init_seq(const struct sk_buff *skb)
      95             : {
      96           4 :         return secure_tcp_seq(ip_hdr(skb)->daddr,
      97           4 :                               ip_hdr(skb)->saddr,
      98           4 :                               tcp_hdr(skb)->dest,
      99           4 :                               tcp_hdr(skb)->source);
     100             : }
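
A rough conceptual model of what an RFC 6528-style initial sequence number
generator computes: a keyed hash over the connection 4-tuple plus a clocked
component so the sequence space keeps advancing. This is only an
illustration; the helper names below are hypothetical, and the kernel's real
implementation of secure_tcp_seq() lives in net/core/secure_seq.c.

        #include <stdint.h>

        /* hypothetical keyed hash over the 4-tuple and a boot-time secret */
        extern uint32_t keyed_hash_4tuple(uint32_t saddr, uint32_t daddr,
                                          uint16_t sport, uint16_t dport);
        /* hypothetical wall-clock source in nanoseconds */
        extern uint64_t clock_ns(void);

        static uint32_t isn_model(uint32_t saddr, uint32_t daddr,
                                  uint16_t sport, uint16_t dport)
        {
                /* the hash makes the ISN unpredictable per connection... */
                uint32_t hash = keyed_hash_4tuple(saddr, daddr, sport, dport);

                /* ...and the clocked term keeps the sequence space moving */
                return hash + (uint32_t)(clock_ns() >> 6);
        }
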
     101             : 
     102           0 : static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
     103             : {
     104           0 :         return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
     105             : }
     106             : 
     107           0 : int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
     108             : {
     109           0 :         const struct inet_timewait_sock *tw = inet_twsk(sktw);
     110           0 :         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
     111           0 :         struct tcp_sock *tp = tcp_sk(sk);
     112           0 :         int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;
     113             : 
     114           0 :         if (reuse == 2) {
     115             :                 /* Still does not detect *everything* that goes through
     116             :                  * lo, since we require a loopback src or dst address
     117             :                  * or direct binding to the 'lo' interface.
     118             :                  */
     119           0 :                 bool loopback = false;
     120           0 :                 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
     121           0 :                         loopback = true;
     122             : #if IS_ENABLED(CONFIG_IPV6)
     123             :                 if (tw->tw_family == AF_INET6) {
     124             :                         if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
     125             :                             ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
     126             :                             ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
     127             :                             ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
     128             :                                 loopback = true;
     129             :                 } else
     130             : #endif
     131             :                 {
     132           0 :                         if (ipv4_is_loopback(tw->tw_daddr) ||
     133           0 :                             ipv4_is_loopback(tw->tw_rcv_saddr))
     134             :                                 loopback = true;
     135             :                 }
     136           0 :                 if (!loopback)
     137           0 :                         reuse = 0;
     138             :         }
     139             : 
     140             :         /* With PAWS, it is safe from the viewpoint
     141             :            of data integrity. Even without PAWS it is safe provided the
     142             :            sequence spaces do not overlap, i.e. at data rates <= 80 Mbit/sec.
     143             : 
     144             :            Actually, the idea is close to VJ's, except that the timestamp
     145             :            cache is held not per host but per port pair, and the TW bucket
     146             :            is used as the state holder.
     147             : 
     148             :            If the TW bucket has already been destroyed, we fall back to VJ's
     149             :            scheme and use the initial timestamp retrieved from the peer table.
     150             :          */
     151           0 :         if (tcptw->tw_ts_recent_stamp &&
     152           0 :             (!twp || (reuse && time_after32(ktime_get_seconds(),
     153             :                                             tcptw->tw_ts_recent_stamp)))) {
     154             :                 /* In case of repair and re-using TIME-WAIT sockets we still
     155             :                  * want to be sure that it is safe as above but honor the
     156             :                  * sequence numbers and time stamps set as part of the repair
     157             :                  * process.
     158             :                  *
     159             :                  * Without this check re-using a TIME-WAIT socket with TCP
     160             :                  * repair would accumulate a -1 on the repair assigned
     161             :                  * sequence number. The first time it is reused the sequence
     162             :                  * is -1, the second time -2, etc. This fixes that issue
     163             :                  * without appearing to create any others.
     164             :                  */
     165           0 :                 if (likely(!tp->repair)) {
     166           0 :                         u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
     167             : 
     168           0 :                         if (!seq)
     169             :                                 seq = 1;
     170           0 :                         WRITE_ONCE(tp->write_seq, seq);
     171           0 :                         tp->rx_opt.ts_recent    = tcptw->tw_ts_recent;
     172           0 :                         tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
     173             :                 }
     174           0 :                 sock_hold(sktw);
     175           0 :                 return 1;
     176             :         }
     177             : 
     178             :         return 0;
     179             : }
     180             : EXPORT_SYMBOL_GPL(tcp_twsk_unique);
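
The reuse == 2 branch above corresponds to setting net.ipv4.tcp_tw_reuse
to 2, which limits TIME-WAIT reuse for outgoing connections to loopback
traffic (0 disables reuse, 1 enables it globally). A minimal userspace
sketch that selects the loopback-only mode:

        #include <stdio.h>

        int main(void)
        {
                FILE *f = fopen("/proc/sys/net/ipv4/tcp_tw_reuse", "w");

                if (!f) {
                        perror("tcp_tw_reuse");
                        return 1;
                }
                fputs("2\n", f);  /* 2 = reuse TIME-WAIT for loopback only */
                return fclose(f) ? 1 : 0;
        }
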
     181             : 
     182           0 : static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
     183             :                               int addr_len)
     184             : {
     185             :         /* This check is replicated from tcp_v4_connect() and is intended to
     186             :          * prevent the BPF program called below from accessing bytes that are
     187             :          * out of the bounds specified by the user in addr_len.
     188             :          */
     189           0 :         if (addr_len < sizeof(struct sockaddr_in))
     190             :                 return -EINVAL;
     191             : 
     192           0 :         sock_owned_by_me(sk);
     193             : 
     194           0 :         return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
     195             : }
     196             : 
     197             : /* This will initiate an outgoing connection. */
     198           0 : int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
     199             : {
     200           0 :         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
     201           0 :         struct inet_sock *inet = inet_sk(sk);
     202           0 :         struct tcp_sock *tp = tcp_sk(sk);
     203           0 :         __be16 orig_sport, orig_dport;
     204           0 :         __be32 daddr, nexthop;
     205           0 :         struct flowi4 *fl4;
     206           0 :         struct rtable *rt;
     207           0 :         int err;
     208           0 :         struct ip_options_rcu *inet_opt;
     209           0 :         struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
     210             : 
     211           0 :         if (addr_len < sizeof(struct sockaddr_in))
     212             :                 return -EINVAL;
     213             : 
     214           0 :         if (usin->sin_family != AF_INET)
     215             :                 return -EAFNOSUPPORT;
     216             : 
     217           0 :         nexthop = daddr = usin->sin_addr.s_addr;
     218           0 :         inet_opt = rcu_dereference_protected(inet->inet_opt,
     219             :                                              lockdep_sock_is_held(sk));
     220           0 :         if (inet_opt && inet_opt->opt.srr) {
     221           0 :                 if (!daddr)
     222             :                         return -EINVAL;
     223           0 :                 nexthop = inet_opt->opt.faddr;
     224             :         }
     225             : 
     226           0 :         orig_sport = inet->inet_sport;
     227           0 :         orig_dport = usin->sin_port;
     228           0 :         fl4 = &inet->cork.fl.u.ip4;
     229           0 :         rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
     230           0 :                               RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
     231             :                               IPPROTO_TCP,
     232             :                               orig_sport, orig_dport, sk);
     233           0 :         if (IS_ERR(rt)) {
     234           0 :                 err = PTR_ERR(rt);
     235           0 :                 if (err == -ENETUNREACH)
     236           0 :                         IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
     237           0 :                 return err;
     238             :         }
     239             : 
     240           0 :         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
     241           0 :                 ip_rt_put(rt);
     242           0 :                 return -ENETUNREACH;
     243             :         }
     244             : 
     245           0 :         if (!inet_opt || !inet_opt->opt.srr)
     246           0 :                 daddr = fl4->daddr;
     247             : 
     248           0 :         if (!inet->inet_saddr)
     249           0 :                 inet->inet_saddr = fl4->saddr;
     250           0 :         sk_rcv_saddr_set(sk, inet->inet_saddr);
     251             : 
     252           0 :         if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
     253             :                 /* Reset inherited state */
     254           0 :                 tp->rx_opt.ts_recent    = 0;
     255           0 :                 tp->rx_opt.ts_recent_stamp = 0;
     256           0 :                 if (likely(!tp->repair))
     257           0 :                         WRITE_ONCE(tp->write_seq, 0);
     258             :         }
     259             : 
     260           0 :         inet->inet_dport = usin->sin_port;
     261           0 :         sk_daddr_set(sk, daddr);
     262             : 
     263           0 :         inet_csk(sk)->icsk_ext_hdr_len = 0;
     264           0 :         if (inet_opt)
     265           0 :                 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
     266             : 
     267           0 :         tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
     268             : 
     269             :         /* Socket identity is still unknown (sport may be zero).
     270             :          * However, we set the state to SYN-SENT and, without releasing the
     271             :          * socket lock, select a source port, enter ourselves into the hash
     272             :          * tables and complete the initialization after this.
     273             :          */
     274           0 :         tcp_set_state(sk, TCP_SYN_SENT);
     275           0 :         err = inet_hash_connect(tcp_death_row, sk);
     276           0 :         if (err)
     277           0 :                 goto failure;
     278             : 
     279           0 :         sk_set_txhash(sk);
     280             : 
     281           0 :         rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
     282           0 :                                inet->inet_sport, inet->inet_dport, sk);
     283           0 :         if (IS_ERR(rt)) {
     284           0 :                 err = PTR_ERR(rt);
     285           0 :                 rt = NULL;
     286           0 :                 goto failure;
     287             :         }
     288             :         /* OK, now commit destination to socket.  */
     289           0 :         sk->sk_gso_type = SKB_GSO_TCPV4;
     290           0 :         sk_setup_caps(sk, &rt->dst);
     291           0 :         rt = NULL;
     292             : 
     293           0 :         if (likely(!tp->repair)) {
     294           0 :                 if (!tp->write_seq)
     295           0 :                         WRITE_ONCE(tp->write_seq,
     296             :                                    secure_tcp_seq(inet->inet_saddr,
     297             :                                                   inet->inet_daddr,
     298             :                                                   inet->inet_sport,
     299             :                                                   usin->sin_port));
     300           0 :                 tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
     301             :                                                  inet->inet_saddr,
     302             :                                                  inet->inet_daddr);
     303             :         }
     304             : 
     305           0 :         inet->inet_id = prandom_u32();
     306             : 
     307           0 :         if (tcp_fastopen_defer_connect(sk, &err))
     308           0 :                 return err;
     309           0 :         if (err)
     310           0 :                 goto failure;
     311             : 
     312           0 :         err = tcp_connect(sk);
     313             : 
     314           0 :         if (err)
     315           0 :                 goto failure;
     316             : 
     317             :         return 0;
     318             : 
     319           0 : failure:
     320             :         /*
     321             :          * This unhashes the socket and releases the local port,
     322             :          * if necessary.
     323             :          */
     324           0 :         tcp_set_state(sk, TCP_CLOSE);
     325           0 :         ip_rt_put(rt);
     326           0 :         sk->sk_route_caps = 0;
     327           0 :         inet->inet_dport = 0;
     328           0 :         return err;
     329             : }
     330             : EXPORT_SYMBOL(tcp_v4_connect);
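
Everything above is driven by an ordinary connect(2) on an AF_INET stream
socket. A minimal userspace example that exercises this path (the address
and port below are arbitrary placeholders):

        #include <arpa/inet.h>
        #include <stdio.h>
        #include <string.h>
        #include <sys/socket.h>
        #include <unistd.h>

        int main(void)
        {
                struct sockaddr_in dst;
                int fd = socket(AF_INET, SOCK_STREAM, 0);

                if (fd < 0) {
                        perror("socket");
                        return 1;
                }
                memset(&dst, 0, sizeof(dst));
                dst.sin_family = AF_INET;  /* anything else -> -EAFNOSUPPORT */
                dst.sin_port = htons(80);
                inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);
                /* an addr_len below sizeof(struct sockaddr_in) fails: -EINVAL */
                if (connect(fd, (struct sockaddr *)&dst, sizeof(dst)) < 0)
                        perror("connect");
                close(fd);
                return 0;
        }
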
     331             : 
     332             : /*
     333             :  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
     334             :  * It can be called through tcp_release_cb() if socket was owned by user
     335             :  * at the time tcp_v4_err() was called to handle ICMP message.
     336             :  */
     337           0 : void tcp_v4_mtu_reduced(struct sock *sk)
     338             : {
     339           0 :         struct inet_sock *inet = inet_sk(sk);
     340           0 :         struct dst_entry *dst;
     341           0 :         u32 mtu;
     342             : 
     343           0 :         if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
     344             :                 return;
     345           0 :         mtu = tcp_sk(sk)->mtu_info;
     346           0 :         dst = inet_csk_update_pmtu(sk, mtu);
     347           0 :         if (!dst)
     348             :                 return;
     349             : 
     350             :         /* Something is about to go wrong... Remember the soft error
     351             :          * for the case that this connection is not able to recover.
     352             :          */
     353           0 :         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
     354           0 :                 sk->sk_err_soft = EMSGSIZE;
     355             : 
     356           0 :         mtu = dst_mtu(dst);
     357             : 
     358           0 :         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
     359           0 :             ip_sk_accept_pmtu(sk) &&
     360           0 :             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
     361           0 :                 tcp_sync_mss(sk, mtu);
     362             : 
     363             :                 /* Resend the TCP packet because it's
     364             :                  * clear that the old packet has been
     365             :                  * dropped. This is the new "fast" path mtu
     366             :                  * discovery.
     367             :                  */
     368           0 :                 tcp_simple_retransmit(sk);
     369             :         } /* else let the usual retransmit timer handle it */
     370             : }
     371             : EXPORT_SYMBOL(tcp_v4_mtu_reduced);
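
The inet->pmtudisc check above means the per-socket IP_MTU_DISCOVER setting
decides whether a reduced path MTU is acted upon. A sketch of setting that
policy from userspace:

        #include <netinet/in.h>
        #include <sys/socket.h>

        /* Ask the kernel to do path MTU discovery and set the DF bit;
         * IP_PMTUDISC_DONT would opt this socket out instead.
         */
        static int enable_pmtud(int fd)
        {
                int val = IP_PMTUDISC_DO;

                return setsockopt(fd, IPPROTO_IP, IP_MTU_DISCOVER,
                                  &val, sizeof(val));
        }
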
     372             : 
     373           0 : static void do_redirect(struct sk_buff *skb, struct sock *sk)
     374             : {
     375           0 :         struct dst_entry *dst = __sk_dst_check(sk, 0);
     376             : 
     377           0 :         if (dst)
     378           0 :                 dst->ops->redirect(dst, sk, skb);
     379           0 : }
     380             : 
     381             : 
     382             : /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
     383           0 : void tcp_req_err(struct sock *sk, u32 seq, bool abort)
     384             : {
     385           0 :         struct request_sock *req = inet_reqsk(sk);
     386           0 :         struct net *net = sock_net(sk);
     387             : 
     388             :         /* ICMPs are not backlogged, hence we cannot get
     389             :          * an established socket here.
     390             :          */
     391           0 :         if (seq != tcp_rsk(req)->snt_isn) {
     392           0 :                 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
     393           0 :         } else if (abort) {
     394             :                 /*
     395             :                  * Still in SYN_RECV, just remove it silently.
     396             :                  * There is no good way to pass the error to the newly
     397             :                  * created socket, and POSIX does not want network
     398             :                  * errors returned from accept().
     399             :                  */
     400           0 :                 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
     401           0 :                 tcp_listendrop(req->rsk_listener);
     402             :         }
     403           0 :         reqsk_put(req);
     404           0 : }
     405             : EXPORT_SYMBOL(tcp_req_err);
     406             : 
     407             : /* TCP-LD (RFC 6069) logic */
     408           0 : void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
     409             : {
     410           0 :         struct inet_connection_sock *icsk = inet_csk(sk);
     411           0 :         struct tcp_sock *tp = tcp_sk(sk);
     412           0 :         struct sk_buff *skb;
     413           0 :         s32 remaining;
     414           0 :         u32 delta_us;
     415             : 
     416           0 :         if (sock_owned_by_user(sk))
     417             :                 return;
     418             : 
     419           0 :         if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
     420           0 :             !icsk->icsk_backoff)
     421             :                 return;
     422             : 
     423           0 :         skb = tcp_rtx_queue_head(sk);
     424           0 :         if (WARN_ON_ONCE(!skb))
     425             :                 return;
     426             : 
     427           0 :         icsk->icsk_backoff--;
     428           0 :         icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
     429           0 :         icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
     430             : 
     431           0 :         tcp_mstamp_refresh(tp);
     432           0 :         delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
     433           0 :         remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
     434             : 
     435           0 :         if (remaining > 0) {
     436           0 :                 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
     437             :                                           remaining, TCP_RTO_MAX);
     438             :         } else {
     439             :                 /* RTO revert clocked out retransmission.
     440             :                  * Will retransmit now.
     441             :                  */
     442           0 :                 tcp_retransmit_timer(sk);
     443             :         }
     444             : }
     445             : EXPORT_SYMBOL(tcp_ld_RTO_revert);
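
The inet_csk_rto_backoff() call above recomputes the timeout as the freshly
derived RTO left-shifted by the remaining backoff count, clamped to an upper
bound. A small conceptual model of that exponential backoff (a sketch, not
the kernel's exact code):

        #include <stdint.h>

        static uint32_t rto_backoff_model(uint32_t base_rto,
                                          unsigned int backoff,
                                          uint32_t rto_max)
        {
                /* shift in 64 bits so a large backoff cannot overflow */
                uint64_t when = (uint64_t)base_rto << backoff;

                return when > rto_max ? rto_max : (uint32_t)when;
        }
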
     446             : 
     447             : /*
     448             :  * This routine is called by the ICMP module when it gets some
     449             :  * sort of error condition.  If err < 0 then the socket should
     450             :  * be closed and the error returned to the user.  If err > 0
     451             :  * it's just the icmp type << 8 | icmp code.  After adjustment,
     452             :  * header points to the first 8 bytes of the tcp header.  We need
     453             :  * to find the appropriate port.
     454             :  *
     455             :  * The locking strategy used here is very "optimistic". When
     456             :  * someone else is accessing the socket, the ICMP is simply dropped,
     457             :  * and for some paths there is no check at all.
     458             :  * A more general error queue for queuing errors for later handling
     459             :  * would probably be better.
     460             :  *
     461             :  */
     462             : 
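
A worked example of the positive "err" encoding described above, using a
port-unreachable ICMP message (type 3, code 3):

        #include <linux/icmp.h>

        /* err > 0 packs the ICMP type and code as (type << 8) | code */
        static const int err_port_unreach =
                (ICMP_DEST_UNREACH << 8) | ICMP_PORT_UNREACH;  /* 0x0303 */

        static inline int icmp_type_of(int err) { return err >> 8;   }
        static inline int icmp_code_of(int err) { return err & 0xff; }
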
     463           0 : int tcp_v4_err(struct sk_buff *skb, u32 info)
     464             : {
     465           0 :         const struct iphdr *iph = (const struct iphdr *)skb->data;
     466           0 :         struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
     467           0 :         struct tcp_sock *tp;
     468           0 :         struct inet_sock *inet;
     469           0 :         const int type = icmp_hdr(skb)->type;
     470           0 :         const int code = icmp_hdr(skb)->code;
     471           0 :         struct sock *sk;
     472           0 :         struct request_sock *fastopen;
     473           0 :         u32 seq, snd_una;
     474           0 :         int err;
     475           0 :         struct net *net = dev_net(skb->dev);
     476             : 
     477           0 :         sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
     478           0 :                                        th->dest, iph->saddr, ntohs(th->source),
     479             :                                        inet_iif(skb), 0);
     480           0 :         if (!sk) {
     481           0 :                 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
     482           0 :                 return -ENOENT;
     483             :         }
     484           0 :         if (sk->sk_state == TCP_TIME_WAIT) {
     485           0 :                 inet_twsk_put(inet_twsk(sk));
     486           0 :                 return 0;
     487             :         }
     488           0 :         seq = ntohl(th->seq);
     489           0 :         if (sk->sk_state == TCP_NEW_SYN_RECV) {
     490           0 :                 tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
     491           0 :                                      type == ICMP_TIME_EXCEEDED ||
     492           0 :                                      (type == ICMP_DEST_UNREACH &&
     493           0 :                                       (code == ICMP_NET_UNREACH ||
     494             :                                        code == ICMP_HOST_UNREACH)));
     495           0 :                 return 0;
     496             :         }
     497             : 
     498           0 :         bh_lock_sock(sk);
     499             :         /* If too many ICMPs get dropped on busy
     500             :          * servers, this needs to be solved differently.
     501             :          * We do take care of the PMTU discovery (RFC 1191) special case:
     502             :          * we can receive locally generated ICMP messages while the socket is held.
     503             :          */
     504           0 :         if (sock_owned_by_user(sk)) {
     505           0 :                 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
     506           0 :                         __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
     507             :         }
     508           0 :         if (sk->sk_state == TCP_CLOSE)
     509           0 :                 goto out;
     510             : 
     511           0 :         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
     512           0 :                 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
     513           0 :                 goto out;
     514             :         }
     515             : 
     516           0 :         tp = tcp_sk(sk);
     517             :         /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
     518           0 :         fastopen = rcu_dereference(tp->fastopen_rsk);
     519           0 :         snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
     520           0 :         if (sk->sk_state != TCP_LISTEN &&
     521           0 :             !between(seq, snd_una, tp->snd_nxt)) {
     522           0 :                 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
     523           0 :                 goto out;
     524             :         }
     525             : 
     526           0 :         switch (type) {
     527             :         case ICMP_REDIRECT:
     528           0 :                 if (!sock_owned_by_user(sk))
     529           0 :                         do_redirect(skb, sk);
     530           0 :                 goto out;
     531           0 :         case ICMP_SOURCE_QUENCH:
     532             :                 /* Just silently ignore these. */
     533           0 :                 goto out;
     534             :         case ICMP_PARAMETERPROB:
     535             :                 err = EPROTO;
     536             :                 break;
     537           0 :         case ICMP_DEST_UNREACH:
     538           0 :                 if (code > NR_ICMP_UNREACH)
     539           0 :                         goto out;
     540             : 
     541           0 :                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
     542             :                         /* We are not interested in TCP_LISTEN and open_requests
     543             :                          * (SYN-ACKs sent out by Linux are always < 576 bytes, so
     544             :                          * they should go through unfragmented).
     545             :                          */
     546           0 :                         if (sk->sk_state == TCP_LISTEN)
     547           0 :                                 goto out;
     548             : 
     549           0 :                         tp->mtu_info = info;
     550           0 :                         if (!sock_owned_by_user(sk)) {
     551           0 :                                 tcp_v4_mtu_reduced(sk);
     552             :                         } else {
     553           0 :                                 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
     554           0 :                                         sock_hold(sk);
     555             :                         }
     556           0 :                         goto out;
     557             :                 }
     558             : 
     559           0 :                 err = icmp_err_convert[code].errno;
     560             :                 /* Check whether this ICMP message allows reverting the
     561             :                  * backoff (see RFC 6069).
     562             :                  */
     563           0 :                 if (!fastopen &&
     564           0 :                     (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
     565           0 :                         tcp_ld_RTO_revert(sk, seq);
     566             :                 break;
     567           0 :         case ICMP_TIME_EXCEEDED:
     568           0 :                 err = EHOSTUNREACH;
     569           0 :                 break;
     570           0 :         default:
     571           0 :                 goto out;
     572             :         }
     573             : 
     574           0 :         switch (sk->sk_state) {
     575           0 :         case TCP_SYN_SENT:
     576             :         case TCP_SYN_RECV:
     577             :                 /* Only in fast or simultaneous open. If a fast open socket is
     578             :                  * already accepted it is treated as a connected one below.
     579             :                  */
     580           0 :                 if (fastopen && !fastopen->sk)
     581             :                         break;
     582             : 
     583           0 :                 ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
     584             : 
     585           0 :                 if (!sock_owned_by_user(sk)) {
     586           0 :                         sk->sk_err = err;
     587             : 
     588           0 :                         sk->sk_error_report(sk);
     589             : 
     590           0 :                         tcp_done(sk);
     591             :                 } else {
     592           0 :                         sk->sk_err_soft = err;
     593             :                 }
     594           0 :                 goto out;
     595             :         }
     596             : 
     597             :         /* If we've already connected, we will keep trying
     598             :          * until we time out or the user gives up.
     599             :          *
     600             :          * RFC 1122 4.2.3.9 allows us to consider as hard errors
     601             :          * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
     602             :          * but it is obsoleted by PMTU discovery).
     603             :          *
     604             :          * Note that in the modern internet, where routing is unreliable
     605             :          * and broken firewalls sit in every dark corner sending random
     606             :          * errors ordered by their masters, even these two messages have
     607             :          * finally lost their original sense (even Linux sends invalid PORT_UNREACHs).
     608             :          *
     609             :          * Now we are in compliance with the RFCs.
     610             :          *                                                      --ANK (980905)
     611             :          */
     612             : 
     613           0 :         inet = inet_sk(sk);
     614           0 :         if (!sock_owned_by_user(sk) && inet->recverr) {
     615           0 :                 sk->sk_err = err;
     616           0 :                 sk->sk_error_report(sk);
     617             :         } else  { /* Only an error on timeout */
     618           0 :                 sk->sk_err_soft = err;
     619             :         }
     620             : 
     621           0 : out:
     622           0 :         bh_unlock_sock(sk);
     623           0 :         sock_put(sk);
     624           0 :         return 0;
     625             : }
     626             : 
     627         430 : void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
     628             : {
     629         430 :         struct tcphdr *th = tcp_hdr(skb);
     630             : 
     631         430 :         th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
     632         430 :         skb->csum_start = skb_transport_header(skb) - skb->head;
     633         430 :         skb->csum_offset = offsetof(struct tcphdr, check);
     634         430 : }
     635             : 
     636             : /* This routine computes an IPv4 TCP checksum. */
     637         426 : void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
     638             : {
     639         426 :         const struct inet_sock *inet = inet_sk(sk);
     640             : 
     641         426 :         __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
     642         426 : }
     643             : EXPORT_SYMBOL(tcp_v4_send_check);
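
__tcp_v4_send_check() above seeds th->check with the complemented
pseudo-header sum and records csum_start/csum_offset so that the NIC (or a
software fallback) can later fold the payload sum into place. The final step
looks conceptually like this standard 32-to-16-bit ones'-complement fold:

        #include <stdint.h>

        /* fold a 32-bit ones'-complement accumulator into the final
         * 16-bit Internet checksum
         */
        static uint16_t csum_fold_model(uint32_t sum)
        {
                sum = (sum & 0xffff) + (sum >> 16);
                sum = (sum & 0xffff) + (sum >> 16);
                return (uint16_t)~sum;
        }
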
     644             : 
     645             : /*
     646             :  *      This routine will send an RST to the other tcp.
     647             :  *
     648             :  *      Someone asks: why do I NEVER use socket parameters (TOS, TTL,
     649             :  *                    etc.) for the reset?
     650             :  *      Answer: if a packet caused an RST, it is not for a socket
     651             :  *              existing in our system; if it did match a socket,
     652             :  *              it is just a duplicate segment or a bug in the other
     653             :  *              side's TCP. So we build the reply based only on the
     654             :  *              parameters that arrived with the segment.
     655             :  *      Exception: precedence violation. We do not implement it in any case.
     656             :  */
     657             : 
     658           0 : static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
     659             : {
     660           0 :         const struct tcphdr *th = tcp_hdr(skb);
     661           0 :         struct {
     662             :                 struct tcphdr th;
     663             : #ifdef CONFIG_TCP_MD5SIG
     664             :                 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
     665             : #endif
     666             :         } rep;
     667           0 :         struct ip_reply_arg arg;
     668             : #ifdef CONFIG_TCP_MD5SIG
     669             :         struct tcp_md5sig_key *key = NULL;
     670             :         const __u8 *hash_location = NULL;
     671             :         unsigned char newhash[16];
     672             :         int genhash;
     673             :         struct sock *sk1 = NULL;
     674             : #endif
     675           0 :         u64 transmit_time = 0;
     676           0 :         struct sock *ctl_sk;
     677           0 :         struct net *net;
     678             : 
     679             :         /* Never send a reset in response to a reset. */
     680           0 :         if (th->rst)
     681           0 :                 return;
     682             : 
     683             :         /* If sk is not NULL, it means we did a successful lookup and the
     684             :          * incoming route had to be correct. prequeue might have dropped our dst.
     685             :          */
     686           0 :         if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
     687             :                 return;
     688             : 
     689             :         /* Swap the send and the receive. */
     690           0 :         memset(&rep, 0, sizeof(rep));
     691           0 :         rep.th.dest   = th->source;
     692           0 :         rep.th.source = th->dest;
     693           0 :         rep.th.doff   = sizeof(struct tcphdr) / 4;
     694           0 :         rep.th.rst    = 1;
     695             : 
     696           0 :         if (th->ack) {
     697           0 :                 rep.th.seq = th->ack_seq;
     698             :         } else {
     699           0 :                 rep.th.ack = 1;
     700           0 :                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
     701             :                                        skb->len - (th->doff << 2));
     702             :         }
     703             : 
     704           0 :         memset(&arg, 0, sizeof(arg));
     705           0 :         arg.iov[0].iov_base = (unsigned char *)&rep;
     706           0 :         arg.iov[0].iov_len  = sizeof(rep.th);
     707             : 
     708           0 :         net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
     709             : #ifdef CONFIG_TCP_MD5SIG
     710             :         rcu_read_lock();
     711             :         hash_location = tcp_parse_md5sig_option(th);
     712             :         if (sk && sk_fullsock(sk)) {
     713             :                 const union tcp_md5_addr *addr;
     714             :                 int l3index;
     715             : 
     716             :                 /* If sdif is set, the packet ingressed via a device
     717             :                  * in an L3 domain, and inet_iif is set to it.
     718             :                  */
     719             :                 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
     720             :                 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
     721             :                 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
     722             :         } else if (hash_location) {
     723             :                 const union tcp_md5_addr *addr;
     724             :                 int sdif = tcp_v4_sdif(skb);
     725             :                 int dif = inet_iif(skb);
     726             :                 int l3index;
     727             : 
     728             :                 /*
     729             :                  * The active side is lost. Try to find the listening socket
     730             :                  * through the source port, and then find the md5 key through
     731             :                  * that listening socket. We do not lose any security here:
     732             :                  * the incoming packet is checked against the md5 hash of the
     733             :                  * found key; no RST is generated if the md5 hash doesn't match.
     734             :                  */
     735             :                 sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
     736             :                                              ip_hdr(skb)->saddr,
     737             :                                              th->source, ip_hdr(skb)->daddr,
     738             :                                              ntohs(th->source), dif, sdif);
     739             :                 /* don't send an RST if we can't find the key */
     740             :                 if (!sk1)
     741             :                         goto out;
     742             : 
     743             :                 /* If sdif is set, the packet ingressed via a device
     744             :                  * in an L3 domain, and dif is set to it.
     745             :                  */
     746             :                 l3index = sdif ? dif : 0;
     747             :                 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
     748             :                 key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
     749             :                 if (!key)
     750             :                         goto out;
     751             : 
     752             : 
     753             :                 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
     754             :                 if (genhash || memcmp(hash_location, newhash, 16) != 0)
     755             :                         goto out;
     756             : 
     757             :         }
     758             : 
     759             :         if (key) {
     760             :                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
     761             :                                    (TCPOPT_NOP << 16) |
     762             :                                    (TCPOPT_MD5SIG << 8) |
     763             :                                    TCPOLEN_MD5SIG);
     764             :                 /* Update length and the length the header thinks exists */
     765             :                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
     766             :                 rep.th.doff = arg.iov[0].iov_len / 4;
     767             : 
     768             :                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
     769             :                                      key, ip_hdr(skb)->saddr,
     770             :                                      ip_hdr(skb)->daddr, &rep.th);
     771             :         }
     772             : #endif
     773           0 :         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
     774           0 :                                       ip_hdr(skb)->saddr, /* XXX */
     775           0 :                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
     776           0 :         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
     777           0 :         arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
     778             : 
     779             :         /* When the socket is gone, all binding information is lost, and
     780             :          * routing might fail in this case. No choice here: if we choose to
     781             :          * force the input interface, we will misroute on an asymmetric route.
     782             :          */
     783           0 :         if (sk) {
     784           0 :                 arg.bound_dev_if = sk->sk_bound_dev_if;
     785           0 :                 if (sk_fullsock(sk))
     786           0 :                         trace_tcp_send_reset(sk, skb);
     787             :         }
     788             : 
     789           0 :         BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
     790             :                      offsetof(struct inet_timewait_sock, tw_bound_dev_if));
     791             : 
     792           0 :         arg.tos = ip_hdr(skb)->tos;
     793           0 :         arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
     794           0 :         local_bh_disable();
     795           0 :         ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
     796           0 :         if (sk) {
     797           0 :                 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
     798           0 :                                    inet_twsk(sk)->tw_mark : sk->sk_mark;
     799           0 :                 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
     800           0 :                                    inet_twsk(sk)->tw_priority : sk->sk_priority;
     801           0 :                 transmit_time = tcp_transmit_time(sk);
     802             :         }
     803           0 :         ip_send_unicast_reply(ctl_sk,
     804           0 :                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
     805           0 :                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
     806           0 :                               &arg, arg.iov[0].iov_len,
     807             :                               transmit_time);
     808             : 
     809           0 :         ctl_sk->sk_mark = 0;
     810           0 :         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
     811           0 :         __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
     812           0 :         local_bh_enable();
     813             : 
     814             : #ifdef CONFIG_TCP_MD5SIG
     815             : out:
     816             :         rcu_read_unlock();
     817             : #endif
     818             : }
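
The header construction above follows the RFC 793 reset-generation rules:
if the offending segment carried an ACK, the RST takes its sequence number
from that ACK; otherwise the RST acknowledges everything the segment
occupied (with SYN and FIN each counting as one sequence number). A compact
model of just that decision:

        #include <stdint.h>

        struct rst_reply {
                uint32_t seq;
                uint32_t ack_seq;
                int      ack;  /* is the ACK flag set on the RST? */
        };

        static struct rst_reply rst_fields(int in_ack, uint32_t in_seq,
                                           uint32_t in_ack_seq,
                                           uint32_t in_seg_len)
        {
                struct rst_reply r = { 0, 0, 0 };

                if (in_ack) {
                        /* <SEQ=SEG.ACK><CTL=RST> */
                        r.seq = in_ack_seq;
                } else {
                        /* <SEQ=0><ACK=SEG.SEQ+SEG.LEN><CTL=RST,ACK> */
                        r.ack = 1;
                        r.ack_seq = in_seq + in_seg_len;
                }
                return r;
        }
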
     819             : 
     820             : /* The code below, which sends ACKs in the SYN-RECV and TIME-WAIT states
     821             :    outside socket context, is certainly ugly. What can I do?
     822             :  */
     823             : 
     824           0 : static void tcp_v4_send_ack(const struct sock *sk,
     825             :                             struct sk_buff *skb, u32 seq, u32 ack,
     826             :                             u32 win, u32 tsval, u32 tsecr, int oif,
     827             :                             struct tcp_md5sig_key *key,
     828             :                             int reply_flags, u8 tos)
     829             : {
     830           0 :         const struct tcphdr *th = tcp_hdr(skb);
     831           0 :         struct {
     832             :                 struct tcphdr th;
     833             :                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
     834             : #ifdef CONFIG_TCP_MD5SIG
     835             :                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
     836             : #endif
     837             :                         ];
     838             :         } rep;
     839           0 :         struct net *net = sock_net(sk);
     840           0 :         struct ip_reply_arg arg;
     841           0 :         struct sock *ctl_sk;
     842           0 :         u64 transmit_time;
     843             : 
     844           0 :         memset(&rep.th, 0, sizeof(struct tcphdr));
     845           0 :         memset(&arg, 0, sizeof(arg));
     846             : 
     847           0 :         arg.iov[0].iov_base = (unsigned char *)&rep;
     848           0 :         arg.iov[0].iov_len  = sizeof(rep.th);
     849           0 :         if (tsecr) {
     850           0 :                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
     851             :                                    (TCPOPT_TIMESTAMP << 8) |
     852             :                                    TCPOLEN_TIMESTAMP);
     853           0 :                 rep.opt[1] = htonl(tsval);
     854           0 :                 rep.opt[2] = htonl(tsecr);
     855           0 :                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
     856             :         }
     857             : 
     858             :         /* Swap the send and the receive. */
     859           0 :         rep.th.dest    = th->source;
     860           0 :         rep.th.source  = th->dest;
     861           0 :         rep.th.doff    = arg.iov[0].iov_len / 4;
     862           0 :         rep.th.seq     = htonl(seq);
     863           0 :         rep.th.ack_seq = htonl(ack);
     864           0 :         rep.th.ack     = 1;
     865           0 :         rep.th.window  = htons(win);
     866             : 
     867             : #ifdef CONFIG_TCP_MD5SIG
     868             :         if (key) {
     869             :                 int offset = (tsecr) ? 3 : 0;
     870             : 
     871             :                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
     872             :                                           (TCPOPT_NOP << 16) |
     873             :                                           (TCPOPT_MD5SIG << 8) |
     874             :                                           TCPOLEN_MD5SIG);
     875             :                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
     876             :                 rep.th.doff = arg.iov[0].iov_len/4;
     877             : 
     878             :                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
     879             :                                     key, ip_hdr(skb)->saddr,
     880             :                                     ip_hdr(skb)->daddr, &rep.th);
     881             :         }
     882             : #endif
     883           0 :         arg.flags = reply_flags;
     884           0 :         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
     885           0 :                                       ip_hdr(skb)->saddr, /* XXX */
     886             :                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
     887           0 :         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
     888           0 :         if (oif)
     889           0 :                 arg.bound_dev_if = oif;
     890           0 :         arg.tos = tos;
     891           0 :         arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
     892           0 :         local_bh_disable();
     893           0 :         ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
     894           0 :         ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
     895           0 :                            inet_twsk(sk)->tw_mark : sk->sk_mark;
     896           0 :         ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
     897           0 :                            inet_twsk(sk)->tw_priority : sk->sk_priority;
     898           0 :         transmit_time = tcp_transmit_time(sk);
     899           0 :         ip_send_unicast_reply(ctl_sk,
     900           0 :                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
     901           0 :                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
     902           0 :                               &arg, arg.iov[0].iov_len,
     903             :                               transmit_time);
     904             : 
     905           0 :         ctl_sk->sk_mark = 0;
     906           0 :         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
     907           0 :         local_bh_enable();
     908           0 : }
     909             : 
     910           0 : static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
     911             : {
     912           0 :         struct inet_timewait_sock *tw = inet_twsk(sk);
     913           0 :         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
     914             : 
     915           0 :         tcp_v4_send_ack(sk, skb,
     916             :                         tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
     917           0 :                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
     918           0 :                         tcp_time_stamp_raw() + tcptw->tw_ts_offset,
     919             :                         tcptw->tw_ts_recent,
     920             :                         tw->tw_bound_dev_if,
     921             :                         tcp_twsk_md5_key(tcptw),
     922           0 :                         tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
     923           0 :                         tw->tw_tos
     924             :                         );
     925             : 
     926           0 :         inet_twsk_put(tw);
     927           0 : }
     928             : 
     929           0 : static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
     930             :                                   struct request_sock *req)
     931             : {
     932           0 :         const union tcp_md5_addr *addr;
     933           0 :         int l3index;
     934             : 
     935             :         /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
     936             :          * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
     937             :          */
     938           0 :         u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
     939           0 :                                              tcp_sk(sk)->snd_nxt;
     940             : 
     941             :         /* RFC 7323 2.3
     942             :          * The window field (SEG.WND) of every outgoing segment, with the
     943             :          * exception of <SYN> segments, MUST be right-shifted by
     944             :          * Rcv.Wind.Shift bits:
     945             :          */
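                      :         /* Editor's sketch of that shift with illustrative numbers:
                      :          * if req->rsk_rcv_wnd is 262144 and rcv_wscale is 7, the
                      :          * on-wire window field below is 262144 >> 7 == 2048, and
                      :          * the peer scales it back up by 1 << 7 when reading it.
                      :          */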
     946           0 :         addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
     947           0 :         l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
     948           0 :         tcp_v4_send_ack(sk, skb, seq,
     949           0 :                         tcp_rsk(req)->rcv_nxt,
     950           0 :                         req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
     951           0 :                         tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
     952             :                         req->ts_recent,
     953             :                         0,
     954             :                         tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
     955           0 :                         inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
     956           0 :                         ip_hdr(skb)->tos);
     957           0 : }
     958             : 
     959             : /*
     960             :  *      Send a SYN-ACK after having received a SYN.
     961             :  *      This still operates on a request_sock only, not on a big
     962             :  *      socket.
     963             :  */
     964           4 : static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
     965             :                               struct flowi *fl,
     966             :                               struct request_sock *req,
     967             :                               struct tcp_fastopen_cookie *foc,
     968             :                               enum tcp_synack_type synack_type,
     969             :                               struct sk_buff *syn_skb)
     970             : {
     971           4 :         const struct inet_request_sock *ireq = inet_rsk(req);
     972           4 :         struct flowi4 fl4;
     973           4 :         int err = -1;
     974           4 :         struct sk_buff *skb;
     975           4 :         u8 tos;
     976             : 
     977             :         /* First, grab a route. */
     978           4 :         if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
     979             :                 return -1;
     980             : 
     981           4 :         skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
     982             : 
     983           4 :         if (skb) {
     984           4 :                 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
     985             : 
     986           4 :                 tos = sock_net(sk)->ipv4.sysctl_tcp_reflect_tos ?
     987           0 :                                 (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
     988           0 :                                 (inet_sk(sk)->tos & INET_ECN_MASK) :
     989           4 :                                 inet_sk(sk)->tos;
     990             : 
     991           8 :                 if (!INET_ECN_is_capable(tos) &&
     992           4 :                     tcp_bpf_ca_needs_ecn((struct sock *)req))
     993           0 :                         tos |= INET_ECN_ECT_0;
     994             : 
     995           4 :                 rcu_read_lock();
     996          12 :                 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
     997             :                                             ireq->ir_rmt_addr,
     998           4 :                                             rcu_dereference(ireq->ireq_opt),
     999             :                                             tos);
    1000           4 :                 rcu_read_unlock();
    1001           4 :                 err = net_xmit_eval(err);
    1002             :         }
    1003             : 
    1004             :         return err;
    1005             : }
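                      : 
                      : /* Editor's note on the tos computation above: INET_ECN_MASK covers
                      :  * the two low (ECN) bits of the ToS byte. With illustrative values
                      :  * syn_tos == 0xb9 and inet_sk(sk)->tos == 0x02, the reflected value
                      :  * is (0xb9 & ~0x03) | (0x02 & 0x03) == 0xba: DSCP taken from the
                      :  * SYN, ECN bits taken from the listener socket.
                      :  */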
    1006             : 
    1007             : /*
    1008             :  *      IPv4 request_sock destructor.
    1009             :  */
    1010           4 : static void tcp_v4_reqsk_destructor(struct request_sock *req)
    1011             : {
    1012           4 :         kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
    1013           4 : }
    1014             : 
    1015             : #ifdef CONFIG_TCP_MD5SIG
    1016             : /*
    1017             :  * RFC2385 MD5 checksumming requires a mapping of
    1018             :  * IP address->MD5 Key.
    1019             :  * We need to maintain these in the sk structure.
    1020             :  */
    1021             : 
    1022             : DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
    1023             : EXPORT_SYMBOL(tcp_md5_needed);
    1024             : 
    1025             : /* Find the Key structure for an address.  */
    1026             : struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
    1027             :                                            const union tcp_md5_addr *addr,
    1028             :                                            int family)
    1029             : {
    1030             :         const struct tcp_sock *tp = tcp_sk(sk);
    1031             :         struct tcp_md5sig_key *key;
    1032             :         const struct tcp_md5sig_info *md5sig;
    1033             :         __be32 mask;
    1034             :         struct tcp_md5sig_key *best_match = NULL;
    1035             :         bool match;
    1036             : 
    1037             :         /* caller either holds rcu_read_lock() or socket lock */
    1038             :         md5sig = rcu_dereference_check(tp->md5sig_info,
    1039             :                                        lockdep_sock_is_held(sk));
    1040             :         if (!md5sig)
    1041             :                 return NULL;
    1042             : 
    1043             :         hlist_for_each_entry_rcu(key, &md5sig->head, node,
    1044             :                                  lockdep_sock_is_held(sk)) {
    1045             :                 if (key->family != family)
    1046             :                         continue;
    1047             :                 if (key->l3index && key->l3index != l3index)
    1048             :                         continue;
    1049             :                 if (family == AF_INET) {
    1050             :                         mask = inet_make_mask(key->prefixlen);
    1051             :                         match = (key->addr.a4.s_addr & mask) ==
    1052             :                                 (addr->a4.s_addr & mask);
    1053             : #if IS_ENABLED(CONFIG_IPV6)
    1054             :                 } else if (family == AF_INET6) {
    1055             :                         match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
    1056             :                                                   key->prefixlen);
    1057             : #endif
    1058             :                 } else {
    1059             :                         match = false;
    1060             :                 }
    1061             : 
    1062             :                 if (match && (!best_match ||
    1063             :                               key->prefixlen > best_match->prefixlen))
    1064             :                         best_match = key;
    1065             :         }
    1066             :         return best_match;
    1067             : }
    1068             : EXPORT_SYMBOL(__tcp_md5_do_lookup);
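                      : 
                      : /* Editor's sketch of the best-match rule above with illustrative
                      :  * keys: with one key installed for 10.0.0.0/8 and another for
                      :  * 10.1.0.0/16, a lookup for peer 10.1.2.3 matches both, and the
                      :  * /16 key is returned because of the
                      :  * key->prefixlen > best_match->prefixlen comparison.
                      :  */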
    1069             : 
    1070             : static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
    1071             :                                                       const union tcp_md5_addr *addr,
    1072             :                                                       int family, u8 prefixlen,
    1073             :                                                       int l3index)
    1074             : {
    1075             :         const struct tcp_sock *tp = tcp_sk(sk);
    1076             :         struct tcp_md5sig_key *key;
    1077             :         unsigned int size = sizeof(struct in_addr);
    1078             :         const struct tcp_md5sig_info *md5sig;
    1079             : 
    1080             :         /* caller either holds rcu_read_lock() or socket lock */
    1081             :         md5sig = rcu_dereference_check(tp->md5sig_info,
    1082             :                                        lockdep_sock_is_held(sk));
    1083             :         if (!md5sig)
    1084             :                 return NULL;
    1085             : #if IS_ENABLED(CONFIG_IPV6)
    1086             :         if (family == AF_INET6)
    1087             :                 size = sizeof(struct in6_addr);
    1088             : #endif
    1089             :         hlist_for_each_entry_rcu(key, &md5sig->head, node,
    1090             :                                  lockdep_sock_is_held(sk)) {
    1091             :                 if (key->family != family)
    1092             :                         continue;
    1093             :                 if (key->l3index && key->l3index != l3index)
    1094             :                         continue;
    1095             :                 if (!memcmp(&key->addr, addr, size) &&
    1096             :                     key->prefixlen == prefixlen)
    1097             :                         return key;
    1098             :         }
    1099             :         return NULL;
    1100             : }
    1101             : 
    1102             : struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
    1103             :                                          const struct sock *addr_sk)
    1104             : {
    1105             :         const union tcp_md5_addr *addr;
    1106             :         int l3index;
    1107             : 
    1108             :         l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
    1109             :                                                  addr_sk->sk_bound_dev_if);
    1110             :         addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
    1111             :         return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
    1112             : }
    1113             : EXPORT_SYMBOL(tcp_v4_md5_lookup);
    1114             : 
    1115             : /* This can be called on a newly created socket, from other files */
    1116             : int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
    1117             :                    int family, u8 prefixlen, int l3index,
    1118             :                    const u8 *newkey, u8 newkeylen, gfp_t gfp)
    1119             : {
    1120             :         /* Add Key to the list */
    1121             :         struct tcp_md5sig_key *key;
    1122             :         struct tcp_sock *tp = tcp_sk(sk);
    1123             :         struct tcp_md5sig_info *md5sig;
    1124             : 
    1125             :         key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
    1126             :         if (key) {
    1127             :                 /* Pre-existing entry - just update that one.
    1128             :                  * Note that the key might be used concurrently.
     1129             :                  * data_race() tells KCSAN that we do not care about
     1130             :                  * key mismatches, since changing the MD5 key on live flows
    1131             :                  * can lead to packet drops.
    1132             :                  */
    1133             :                 data_race(memcpy(key->key, newkey, newkeylen));
    1134             : 
    1135             :                 /* Pairs with READ_ONCE() in tcp_md5_hash_key().
    1136             :                  * Also note that a reader could catch new key->keylen value
    1137             :                  * but old key->key[], this is the reason we use __GFP_ZERO
    1138             :                  * at sock_kmalloc() time below these lines.
    1139             :                  */
    1140             :                 WRITE_ONCE(key->keylen, newkeylen);
    1141             : 
    1142             :                 return 0;
    1143             :         }
    1144             : 
    1145             :         md5sig = rcu_dereference_protected(tp->md5sig_info,
    1146             :                                            lockdep_sock_is_held(sk));
    1147             :         if (!md5sig) {
    1148             :                 md5sig = kmalloc(sizeof(*md5sig), gfp);
    1149             :                 if (!md5sig)
    1150             :                         return -ENOMEM;
    1151             : 
    1152             :                 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
    1153             :                 INIT_HLIST_HEAD(&md5sig->head);
    1154             :                 rcu_assign_pointer(tp->md5sig_info, md5sig);
    1155             :         }
    1156             : 
    1157             :         key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
    1158             :         if (!key)
    1159             :                 return -ENOMEM;
    1160             :         if (!tcp_alloc_md5sig_pool()) {
    1161             :                 sock_kfree_s(sk, key, sizeof(*key));
    1162             :                 return -ENOMEM;
    1163             :         }
    1164             : 
    1165             :         memcpy(key->key, newkey, newkeylen);
    1166             :         key->keylen = newkeylen;
    1167             :         key->family = family;
    1168             :         key->prefixlen = prefixlen;
    1169             :         key->l3index = l3index;
    1170             :         memcpy(&key->addr, addr,
    1171             :                (family == AF_INET6) ? sizeof(struct in6_addr) :
    1172             :                                       sizeof(struct in_addr));
    1173             :         hlist_add_head_rcu(&key->node, &md5sig->head);
    1174             :         return 0;
    1175             : }
    1176             : EXPORT_SYMBOL(tcp_md5_do_add);
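                      : 
                      : /* Editor's sketch of the reader side that the WRITE_ONCE() above
                      :  * pairs with (assumed shape; see tcp_md5_hash_key() in
                      :  * net/ipv4/tcp.c):
                      :  *
                      :  *      u8 keylen = READ_ONCE(key->keylen);     // snapshot once
                      :  *
                      :  *      sg_init_one(&sg, key->key, keylen);
                      :  *
                      :  * A reader may observe the new keylen together with the old key
                      :  * bytes; because the key was allocated with __GFP_ZERO, any
                      :  * trailing bytes it hashes are zero rather than uninitialized
                      :  * memory.
                      :  */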
    1177             : 
    1178             : int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
    1179             :                    u8 prefixlen, int l3index)
    1180             : {
    1181             :         struct tcp_md5sig_key *key;
    1182             : 
    1183             :         key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
    1184             :         if (!key)
    1185             :                 return -ENOENT;
    1186             :         hlist_del_rcu(&key->node);
    1187             :         atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
    1188             :         kfree_rcu(key, rcu);
    1189             :         return 0;
    1190             : }
    1191             : EXPORT_SYMBOL(tcp_md5_do_del);
    1192             : 
    1193             : static void tcp_clear_md5_list(struct sock *sk)
    1194             : {
    1195             :         struct tcp_sock *tp = tcp_sk(sk);
    1196             :         struct tcp_md5sig_key *key;
    1197             :         struct hlist_node *n;
    1198             :         struct tcp_md5sig_info *md5sig;
    1199             : 
    1200             :         md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
    1201             : 
    1202             :         hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
    1203             :                 hlist_del_rcu(&key->node);
    1204             :                 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
    1205             :                 kfree_rcu(key, rcu);
    1206             :         }
    1207             : }
    1208             : 
    1209             : static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
    1210             :                                  sockptr_t optval, int optlen)
    1211             : {
    1212             :         struct tcp_md5sig cmd;
    1213             :         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
    1214             :         const union tcp_md5_addr *addr;
    1215             :         u8 prefixlen = 32;
    1216             :         int l3index = 0;
    1217             : 
    1218             :         if (optlen < sizeof(cmd))
    1219             :                 return -EINVAL;
    1220             : 
    1221             :         if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
    1222             :                 return -EFAULT;
    1223             : 
    1224             :         if (sin->sin_family != AF_INET)
    1225             :                 return -EINVAL;
    1226             : 
    1227             :         if (optname == TCP_MD5SIG_EXT &&
    1228             :             cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
    1229             :                 prefixlen = cmd.tcpm_prefixlen;
    1230             :                 if (prefixlen > 32)
    1231             :                         return -EINVAL;
    1232             :         }
    1233             : 
    1234             :         if (optname == TCP_MD5SIG_EXT &&
    1235             :             cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
    1236             :                 struct net_device *dev;
    1237             : 
    1238             :                 rcu_read_lock();
    1239             :                 dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
    1240             :                 if (dev && netif_is_l3_master(dev))
    1241             :                         l3index = dev->ifindex;
    1242             : 
    1243             :                 rcu_read_unlock();
    1244             : 
     1245             :                 /* ok to check dev/l3index set/not set outside of rcu;
     1246             :                  * right now the device MUST be an L3 master
     1247             :                  */
    1248             :                 if (!dev || !l3index)
    1249             :                         return -EINVAL;
    1250             :         }
    1251             : 
    1252             :         addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
    1253             : 
    1254             :         if (!cmd.tcpm_keylen)
    1255             :                 return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index);
    1256             : 
    1257             :         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
    1258             :                 return -EINVAL;
    1259             : 
    1260             :         return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index,
    1261             :                               cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
    1262             : }
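                      : 
                      : /* Editor's sketch of the userspace side of this handler: a
                      :  * hypothetical client installing an MD5 key for one peer (address
                      :  * and key below are illustrative only).
                      :  *
                      :  *      struct tcp_md5sig md5 = {};
                      :  *      struct sockaddr_in *sin = (struct sockaddr_in *)&md5.tcpm_addr;
                      :  *
                      :  *      sin->sin_family = AF_INET;
                      :  *      inet_pton(AF_INET, "192.0.2.1", &sin->sin_addr);
                      :  *      md5.tcpm_keylen = 6;
                      :  *      memcpy(md5.tcpm_key, "secret", 6);
                      :  *      if (setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5)))
                      :  *              perror("TCP_MD5SIG");
                      :  *
                      :  * Passing tcpm_keylen == 0 instead deletes the key, matching the
                      :  * tcp_md5_do_del() branch above.
                      :  */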
    1263             : 
    1264             : static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
    1265             :                                    __be32 daddr, __be32 saddr,
    1266             :                                    const struct tcphdr *th, int nbytes)
    1267             : {
    1268             :         struct tcp4_pseudohdr *bp;
    1269             :         struct scatterlist sg;
    1270             :         struct tcphdr *_th;
    1271             : 
    1272             :         bp = hp->scratch;
    1273             :         bp->saddr = saddr;
    1274             :         bp->daddr = daddr;
    1275             :         bp->pad = 0;
    1276             :         bp->protocol = IPPROTO_TCP;
    1277             :         bp->len = cpu_to_be16(nbytes);
    1278             : 
    1279             :         _th = (struct tcphdr *)(bp + 1);
    1280             :         memcpy(_th, th, sizeof(*th));
    1281             :         _th->check = 0;
    1282             : 
    1283             :         sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
    1284             :         ahash_request_set_crypt(hp->md5_req, &sg, NULL,
    1285             :                                 sizeof(*bp) + sizeof(*th));
    1286             :         return crypto_ahash_update(hp->md5_req);
    1287             : }
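                      : 
                      : /* Editor's note: per RFC 2385 section 2.0 the digest covers, in
                      :  * order, the pseudo-header built above (saddr, daddr, zero pad,
                      :  * protocol, segment length), the TCP header with its checksum
                      :  * zeroed (_th->check = 0 above), the segment data if any, and
                      :  * finally the connection key. This helper feeds the first two
                      :  * pieces; the callers below add the remaining ones.
                      :  */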
    1288             : 
    1289             : static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
    1290             :                                __be32 daddr, __be32 saddr, const struct tcphdr *th)
    1291             : {
    1292             :         struct tcp_md5sig_pool *hp;
    1293             :         struct ahash_request *req;
    1294             : 
    1295             :         hp = tcp_get_md5sig_pool();
    1296             :         if (!hp)
    1297             :                 goto clear_hash_noput;
    1298             :         req = hp->md5_req;
    1299             : 
    1300             :         if (crypto_ahash_init(req))
    1301             :                 goto clear_hash;
    1302             :         if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
    1303             :                 goto clear_hash;
    1304             :         if (tcp_md5_hash_key(hp, key))
    1305             :                 goto clear_hash;
    1306             :         ahash_request_set_crypt(req, NULL, md5_hash, 0);
    1307             :         if (crypto_ahash_final(req))
    1308             :                 goto clear_hash;
    1309             : 
    1310             :         tcp_put_md5sig_pool();
    1311             :         return 0;
    1312             : 
    1313             : clear_hash:
    1314             :         tcp_put_md5sig_pool();
    1315             : clear_hash_noput:
    1316             :         memset(md5_hash, 0, 16);
    1317             :         return 1;
    1318             : }
    1319             : 
    1320             : int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
    1321             :                         const struct sock *sk,
    1322             :                         const struct sk_buff *skb)
    1323             : {
    1324             :         struct tcp_md5sig_pool *hp;
    1325             :         struct ahash_request *req;
    1326             :         const struct tcphdr *th = tcp_hdr(skb);
    1327             :         __be32 saddr, daddr;
    1328             : 
    1329             :         if (sk) { /* valid for establish/request sockets */
    1330             :                 saddr = sk->sk_rcv_saddr;
    1331             :                 daddr = sk->sk_daddr;
    1332             :         } else {
    1333             :                 const struct iphdr *iph = ip_hdr(skb);
    1334             :                 saddr = iph->saddr;
    1335             :                 daddr = iph->daddr;
    1336             :         }
    1337             : 
    1338             :         hp = tcp_get_md5sig_pool();
    1339             :         if (!hp)
    1340             :                 goto clear_hash_noput;
    1341             :         req = hp->md5_req;
    1342             : 
    1343             :         if (crypto_ahash_init(req))
    1344             :                 goto clear_hash;
    1345             : 
    1346             :         if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
    1347             :                 goto clear_hash;
    1348             :         if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
    1349             :                 goto clear_hash;
    1350             :         if (tcp_md5_hash_key(hp, key))
    1351             :                 goto clear_hash;
    1352             :         ahash_request_set_crypt(req, NULL, md5_hash, 0);
    1353             :         if (crypto_ahash_final(req))
    1354             :                 goto clear_hash;
    1355             : 
    1356             :         tcp_put_md5sig_pool();
    1357             :         return 0;
    1358             : 
    1359             : clear_hash:
    1360             :         tcp_put_md5sig_pool();
    1361             : clear_hash_noput:
    1362             :         memset(md5_hash, 0, 16);
    1363             :         return 1;
    1364             : }
    1365             : EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
    1366             : 
    1367             : #endif
    1368             : 
    1369             : /* Called with rcu_read_lock() */
    1370         438 : static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
    1371             :                                     const struct sk_buff *skb,
    1372             :                                     int dif, int sdif)
    1373             : {
    1374             : #ifdef CONFIG_TCP_MD5SIG
    1375             :         /*
    1376             :          * This gets called for each TCP segment that arrives
    1377             :          * so we want to be efficient.
    1378             :          * We have 3 drop cases:
    1379             :          * o No MD5 hash and one expected.
    1380             :          * o MD5 hash and we're not expecting one.
     1381             :          * o MD5 hash and it's wrong.
    1382             :          */
    1383             :         const __u8 *hash_location = NULL;
    1384             :         struct tcp_md5sig_key *hash_expected;
    1385             :         const struct iphdr *iph = ip_hdr(skb);
    1386             :         const struct tcphdr *th = tcp_hdr(skb);
    1387             :         const union tcp_md5_addr *addr;
    1388             :         unsigned char newhash[16];
    1389             :         int genhash, l3index;
    1390             : 
     1391             :         /* If sdif is set, the packet ingressed via a device
     1392             :          * in an L3 domain and dif is set to the l3mdev index.
     1393             :          */
    1394             :         l3index = sdif ? dif : 0;
    1395             : 
    1396             :         addr = (union tcp_md5_addr *)&iph->saddr;
    1397             :         hash_expected = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
    1398             :         hash_location = tcp_parse_md5sig_option(th);
    1399             : 
    1400             :         /* We've parsed the options - do we have a hash? */
    1401             :         if (!hash_expected && !hash_location)
    1402             :                 return false;
    1403             : 
    1404             :         if (hash_expected && !hash_location) {
    1405             :                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
    1406             :                 return true;
    1407             :         }
    1408             : 
    1409             :         if (!hash_expected && hash_location) {
    1410             :                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
    1411             :                 return true;
    1412             :         }
    1413             : 
    1414             :         /* Okay, so this is hash_expected and hash_location -
    1415             :          * so we need to calculate the checksum.
    1416             :          */
    1417             :         genhash = tcp_v4_md5_hash_skb(newhash,
    1418             :                                       hash_expected,
    1419             :                                       NULL, skb);
    1420             : 
    1421             :         if (genhash || memcmp(hash_location, newhash, 16) != 0) {
    1422             :                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
    1423             :                 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s L3 index %d\n",
    1424             :                                      &iph->saddr, ntohs(th->source),
    1425             :                                      &iph->daddr, ntohs(th->dest),
    1426             :                                      genhash ? " tcp_v4_calc_md5_hash failed"
    1427             :                                      : "", l3index);
    1428             :                 return true;
    1429             :         }
    1430             :         return false;
    1431             : #endif
    1432         438 :         return false;
    1433             : }
    1434             : 
    1435           4 : static void tcp_v4_init_req(struct request_sock *req,
    1436             :                             const struct sock *sk_listener,
    1437             :                             struct sk_buff *skb)
    1438             : {
    1439           4 :         struct inet_request_sock *ireq = inet_rsk(req);
    1440           4 :         struct net *net = sock_net(sk_listener);
    1441             : 
    1442           4 :         sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
    1443           4 :         sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
    1444           4 :         RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
    1445           4 : }
    1446             : 
    1447           4 : static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
    1448             :                                           struct sk_buff *skb,
    1449             :                                           struct flowi *fl,
    1450             :                                           struct request_sock *req)
    1451             : {
    1452           4 :         tcp_v4_init_req(req, sk, skb);
    1453             : 
    1454           4 :         if (security_inet_conn_request(sk, skb, req))
    1455             :                 return NULL;
    1456             : 
    1457           4 :         return inet_csk_route_req(sk, &fl->u.ip4, req);
    1458             : }
    1459             : 
    1460             : struct request_sock_ops tcp_request_sock_ops __read_mostly = {
    1461             :         .family         =       PF_INET,
    1462             :         .obj_size       =       sizeof(struct tcp_request_sock),
    1463             :         .rtx_syn_ack    =       tcp_rtx_synack,
    1464             :         .send_ack       =       tcp_v4_reqsk_send_ack,
    1465             :         .destructor     =       tcp_v4_reqsk_destructor,
    1466             :         .send_reset     =       tcp_v4_send_reset,
    1467             :         .syn_ack_timeout =      tcp_syn_ack_timeout,
    1468             : };
    1469             : 
    1470             : const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
    1471             :         .mss_clamp      =       TCP_MSS_DEFAULT,
    1472             : #ifdef CONFIG_TCP_MD5SIG
    1473             :         .req_md5_lookup =       tcp_v4_md5_lookup,
    1474             :         .calc_md5_hash  =       tcp_v4_md5_hash_skb,
    1475             : #endif
    1476             : #ifdef CONFIG_SYN_COOKIES
    1477             :         .cookie_init_seq =      cookie_v4_init_sequence,
    1478             : #endif
    1479             :         .route_req      =       tcp_v4_route_req,
    1480             :         .init_seq       =       tcp_v4_init_seq,
    1481             :         .init_ts_off    =       tcp_v4_init_ts_off,
    1482             :         .send_synack    =       tcp_v4_send_synack,
    1483             : };
    1484             : 
    1485           4 : int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
    1486             : {
     1487             :         /* Never answer SYNs sent to broadcast or multicast */
    1488           4 :         if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
    1489           0 :                 goto drop;
    1490             : 
    1491           4 :         return tcp_conn_request(&tcp_request_sock_ops,
    1492             :                                 &tcp_request_sock_ipv4_ops, sk, skb);
    1493             : 
    1494           0 : drop:
    1495           0 :         tcp_listendrop(sk);
    1496           0 :         return 0;
    1497             : }
    1498             : EXPORT_SYMBOL(tcp_v4_conn_request);
    1499             : 
    1500             : 
    1501             : /*
    1502             :  * The three way handshake has completed - we got a valid synack -
    1503             :  * now create the new socket.
    1504             :  */
    1505           4 : struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
    1506             :                                   struct request_sock *req,
    1507             :                                   struct dst_entry *dst,
    1508             :                                   struct request_sock *req_unhash,
    1509             :                                   bool *own_req)
    1510             : {
    1511           4 :         struct inet_request_sock *ireq;
    1512           4 :         bool found_dup_sk = false;
    1513           4 :         struct inet_sock *newinet;
    1514           4 :         struct tcp_sock *newtp;
    1515           4 :         struct sock *newsk;
    1516             : #ifdef CONFIG_TCP_MD5SIG
    1517             :         const union tcp_md5_addr *addr;
    1518             :         struct tcp_md5sig_key *key;
    1519             :         int l3index;
    1520             : #endif
    1521           4 :         struct ip_options_rcu *inet_opt;
    1522             : 
    1523           4 :         if (sk_acceptq_is_full(sk))
    1524           0 :                 goto exit_overflow;
    1525             : 
    1526           4 :         newsk = tcp_create_openreq_child(sk, req, skb);
    1527           4 :         if (!newsk)
    1528           0 :                 goto exit_nonewsk;
    1529             : 
    1530           4 :         newsk->sk_gso_type = SKB_GSO_TCPV4;
    1531           4 :         inet_sk_rx_dst_set(newsk, skb);
    1532             : 
    1533           4 :         newtp                 = tcp_sk(newsk);
    1534           4 :         newinet               = inet_sk(newsk);
    1535           4 :         ireq                  = inet_rsk(req);
    1536           4 :         sk_daddr_set(newsk, ireq->ir_rmt_addr);
    1537           4 :         sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
    1538           4 :         newsk->sk_bound_dev_if = ireq->ir_iif;
    1539           4 :         newinet->inet_saddr   = ireq->ir_loc_addr;
    1540           4 :         inet_opt              = rcu_dereference(ireq->ireq_opt);
    1541           4 :         RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
    1542           4 :         newinet->mc_index     = inet_iif(skb);
    1543           4 :         newinet->mc_ttl            = ip_hdr(skb)->ttl;
    1544           4 :         newinet->rcv_tos      = ip_hdr(skb)->tos;
    1545           4 :         inet_csk(newsk)->icsk_ext_hdr_len = 0;
    1546           4 :         if (inet_opt)
    1547           0 :                 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
    1548           4 :         newinet->inet_id = prandom_u32();
    1549             : 
    1550             :         /* Set ToS of the new socket based upon the value of incoming SYN.
    1551             :          * ECT bits are set later in tcp_init_transfer().
    1552             :          */
    1553           4 :         if (sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)
    1554           0 :                 newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
    1555             : 
    1556           4 :         if (!dst) {
    1557           4 :                 dst = inet_csk_route_child_sock(sk, newsk, req);
    1558           4 :                 if (!dst)
    1559           0 :                         goto put_and_exit;
    1560             :         } else {
    1561             :                 /* syncookie case : see end of cookie_v4_check() */
    1562           4 :         }
    1563           4 :         sk_setup_caps(newsk, dst);
    1564             : 
    1565           4 :         tcp_ca_openreq_child(newsk, dst);
    1566             : 
    1567           4 :         tcp_sync_mss(newsk, dst_mtu(dst));
    1568           4 :         newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
    1569             : 
    1570           4 :         tcp_initialize_rcv_mss(newsk);
    1571             : 
    1572             : #ifdef CONFIG_TCP_MD5SIG
    1573             :         l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
    1574             :         /* Copy over the MD5 key from the original socket */
    1575             :         addr = (union tcp_md5_addr *)&newinet->inet_daddr;
    1576             :         key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
    1577             :         if (key) {
    1578             :                 /*
    1579             :                  * We're using one, so create a matching key
    1580             :                  * on the newsk structure. If we fail to get
    1581             :                  * memory, then we end up not copying the key
    1582             :                  * across. Shucks.
    1583             :                  */
    1584             :                 tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index,
    1585             :                                key->key, key->keylen, GFP_ATOMIC);
    1586             :                 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
    1587             :         }
    1588             : #endif
    1589             : 
    1590           4 :         if (__inet_inherit_port(sk, newsk) < 0)
    1591           0 :                 goto put_and_exit;
    1592           4 :         *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
    1593             :                                        &found_dup_sk);
    1594           4 :         if (likely(*own_req)) {
    1595           4 :                 tcp_move_syn(newtp, req);
    1596           4 :                 ireq->ireq_opt = NULL;
    1597             :         } else {
    1598           0 :                 newinet->inet_opt = NULL;
    1599             : 
    1600           0 :                 if (!req_unhash && found_dup_sk) {
     1601             :                         /* This code path should be executed in the
     1602             :                          * syncookie case only.
    1603             :                          */
    1604           0 :                         bh_unlock_sock(newsk);
    1605           0 :                         sock_put(newsk);
    1606           0 :                         newsk = NULL;
    1607             :                 }
    1608             :         }
    1609             :         return newsk;
    1610             : 
    1611           0 : exit_overflow:
    1612           0 :         NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
    1613           0 : exit_nonewsk:
    1614           0 :         dst_release(dst);
    1615           0 : exit:
    1616           0 :         tcp_listendrop(sk);
    1617           0 :         return NULL;
    1618           0 : put_and_exit:
    1619           0 :         newinet->inet_opt = NULL;
    1620           0 :         inet_csk_prepare_forced_close(newsk);
    1621           0 :         tcp_done(newsk);
    1622           0 :         goto exit;
    1623             : }
    1624             : EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
    1625             : 
    1626           4 : static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
    1627             : {
    1628             : #ifdef CONFIG_SYN_COOKIES
    1629             :         const struct tcphdr *th = tcp_hdr(skb);
    1630             : 
    1631             :         if (!th->syn)
    1632             :                 sk = cookie_v4_check(sk, skb);
    1633             : #endif
    1634           4 :         return sk;
    1635             : }
    1636             : 
    1637           0 : u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
    1638             :                          struct tcphdr *th, u32 *cookie)
    1639             : {
    1640           0 :         u16 mss = 0;
    1641             : #ifdef CONFIG_SYN_COOKIES
    1642             :         mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
    1643             :                                     &tcp_request_sock_ipv4_ops, sk, th);
    1644             :         if (mss) {
    1645             :                 *cookie = __cookie_v4_init_sequence(iph, th, &mss);
    1646             :                 tcp_synq_overflow(sk);
    1647             :         }
    1648             : #endif
    1649           0 :         return mss;
    1650             : }
    1651             : 
    1652             : INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
    1653             :                                                            u32));
     1654             : /* The socket must have its spinlock held when we get
    1655             :  * here, unless it is a TCP_LISTEN socket.
    1656             :  *
    1657             :  * We have a potential double-lock case here, so even when
    1658             :  * doing backlog processing we use the BH locking scheme.
    1659             :  * This is because we cannot sleep with the original spinlock
    1660             :  * held.
    1661             :  */
    1662         419 : int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
    1663             : {
    1664         419 :         struct sock *rsk;
    1665             : 
    1666         419 :         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
    1667         412 :                 struct dst_entry *dst = sk->sk_rx_dst;
    1668             : 
    1669         412 :                 sock_rps_save_rxhash(sk, skb);
    1670         412 :                 sk_mark_napi_id(sk, skb);
    1671         412 :                 if (dst) {
    1672         824 :                         if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
    1673         412 :                             !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
    1674             :                                              dst, 0)) {
    1675           0 :                                 dst_release(dst);
    1676           0 :                                 sk->sk_rx_dst = NULL;
    1677             :                         }
    1678             :                 }
    1679         412 :                 tcp_rcv_established(sk, skb);
    1680         412 :                 return 0;
    1681             :         }
    1682             : 
    1683           7 :         if (tcp_checksum_complete(skb))
    1684           0 :                 goto csum_err;
    1685             : 
    1686           7 :         if (sk->sk_state == TCP_LISTEN) {
    1687           4 :                 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
    1688             : 
    1689           4 :                 if (!nsk)
    1690           0 :                         goto discard;
    1691             :                 if (nsk != sk) {
    1692             :                         if (tcp_child_process(sk, nsk, skb)) {
    1693             :                                 rsk = nsk;
    1694             :                                 goto reset;
    1695             :                         }
    1696             :                         return 0;
    1697             :                 }
    1698             :         } else
    1699           3 :                 sock_rps_save_rxhash(sk, skb);
    1700             : 
    1701           7 :         if (tcp_rcv_state_process(sk, skb)) {
    1702           0 :                 rsk = sk;
    1703           0 :                 goto reset;
    1704             :         }
    1705             :         return 0;
    1706             : 
    1707           0 : reset:
    1708           0 :         tcp_v4_send_reset(rsk, skb);
    1709           0 : discard:
    1710           0 :         kfree_skb(skb);
    1711             :         /* Be careful here. If this function gets more complicated and
    1712             :          * gcc suffers from register pressure on the x86, sk (in %ebx)
    1713             :          * might be destroyed here. This current version compiles correctly,
    1714             :          * but you have been warned.
    1715             :          */
    1716           0 :         return 0;
    1717             : 
    1718           0 : csum_err:
    1719           0 :         TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
    1720           0 :         TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
    1721           0 :         goto discard;
    1722             : }
    1723             : EXPORT_SYMBOL(tcp_v4_do_rcv);
    1724             : 
    1725         389 : int tcp_v4_early_demux(struct sk_buff *skb)
    1726             : {
    1727         389 :         const struct iphdr *iph;
    1728         389 :         const struct tcphdr *th;
    1729         389 :         struct sock *sk;
    1730             : 
    1731         389 :         if (skb->pkt_type != PACKET_HOST)
    1732             :                 return 0;
    1733             : 
    1734         389 :         if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
    1735             :                 return 0;
    1736             : 
    1737         389 :         iph = ip_hdr(skb);
    1738         389 :         th = tcp_hdr(skb);
    1739             : 
    1740         389 :         if (th->doff < sizeof(struct tcphdr) / 4)
    1741             :                 return 0;
    1742             : 
    1743         389 :         sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
    1744         389 :                                        iph->saddr, th->source,
    1745         389 :                                        iph->daddr, ntohs(th->dest),
    1746             :                                        skb->skb_iif, inet_sdif(skb));
    1747         389 :         if (sk) {
    1748         385 :                 skb->sk = sk;
    1749         385 :                 skb->destructor = sock_edemux;
    1750         385 :                 if (sk_fullsock(sk)) {
    1751         381 :                         struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
    1752             : 
    1753         381 :                         if (dst)
    1754         381 :                                 dst = dst_check(dst, 0);
    1755         381 :                         if (dst &&
    1756         381 :                             inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
    1757         381 :                                 skb_dst_set_noref(skb, dst);
    1758             :                 }
    1759             :         }
    1760             :         return 0;
    1761             : }
    1762             : 
    1763         157 : bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
    1764             : {
    1765         157 :         u32 limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf);
    1766         157 :         u32 tail_gso_size, tail_gso_segs;
    1767         157 :         struct skb_shared_info *shinfo;
    1768         157 :         const struct tcphdr *th;
    1769         157 :         struct tcphdr *thtail;
    1770         157 :         struct sk_buff *tail;
    1771         157 :         unsigned int hdrlen;
    1772         157 :         bool fragstolen;
    1773         157 :         u32 gso_segs;
    1774         157 :         u32 gso_size;
    1775         157 :         int delta;
    1776             : 
    1777             :         /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
    1778             :          * we can fix skb->truesize to its real value to avoid future drops.
    1779             :          * This is valid because skb is not yet charged to the socket.
     1780             :          * It has been noticed that pure SACK packets were sometimes
     1781             :          * dropped (if cooked by drivers without the copybreak feature).
    1782             :          */
    1783         157 :         skb_condense(skb);
    1784             : 
    1785         157 :         skb_dst_drop(skb);
    1786             : 
    1787         157 :         if (unlikely(tcp_checksum_complete(skb))) {
    1788           0 :                 bh_unlock_sock(sk);
    1789           0 :                 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
    1790           0 :                 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
    1791           0 :                 return true;
    1792             :         }
    1793             : 
    1794             :         /* Attempt coalescing to last skb in backlog, even if we are
    1795             :          * above the limits.
    1796             :          * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
    1797             :          */
    1798         157 :         th = (const struct tcphdr *)skb->data;
    1799         157 :         hdrlen = th->doff * 4;
    1800             : 
    1801         157 :         tail = sk->sk_backlog.tail;
    1802         157 :         if (!tail)
    1803         141 :                 goto no_coalesce;
    1804          16 :         thtail = (struct tcphdr *)tail->data;
    1805             : 
    1806          16 :         if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
    1807          16 :             TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
    1808          16 :             ((TCP_SKB_CB(tail)->tcp_flags |
    1809          16 :               TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
    1810             :             !((TCP_SKB_CB(tail)->tcp_flags &
    1811          16 :               TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
    1812             :             ((TCP_SKB_CB(tail)->tcp_flags ^
    1813          16 :               TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
    1814             : #ifdef CONFIG_TLS_DEVICE
    1815             :             tail->decrypted != skb->decrypted ||
    1816             : #endif
    1817          16 :             thtail->doff != th->doff ||
    1818          16 :             memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
    1819           0 :                 goto no_coalesce;
    1820             : 
    1821          16 :         __skb_pull(skb, hdrlen);
    1822             : 
    1823          16 :         shinfo = skb_shinfo(skb);
    1824          16 :         gso_size = shinfo->gso_size ?: skb->len;
    1825          16 :         gso_segs = shinfo->gso_segs ?: 1;
    1826             : 
    1827          16 :         shinfo = skb_shinfo(tail);
    1828          16 :         tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
    1829          16 :         tail_gso_segs = shinfo->gso_segs ?: 1;
    1830             : 
    1831          16 :         if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
    1832          15 :                 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
    1833             : 
    1834          15 :                 if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
    1835          15 :                         TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
    1836          15 :                         thtail->window = th->window;
    1837             :                 }
    1838             : 
    1839             :                 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and
    1840             :                  * thtail->fin, so that the fast path in tcp_rcv_established()
    1841             :                  * is not entered if we append a packet with a FIN.
    1842             :                  * SYN, RST, URG are not present.
    1843             :                  * ACK is set on both packets.
    1844             :                  * PSH : we do not really care in TCP stack,
    1845             :                  *       at least for 'GRO' packets.
    1846             :                  */
    1847          15 :                 thtail->fin |= th->fin;
    1848          15 :                 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
    1849             : 
    1850          15 :                 if (TCP_SKB_CB(skb)->has_rxtstamp) {
    1851           0 :                         TCP_SKB_CB(tail)->has_rxtstamp = true;
    1852           0 :                         tail->tstamp = skb->tstamp;
    1853           0 :                         skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
    1854             :                 }
    1855             : 
    1856             :                 /* Not as strict as GRO. We only need to carry mss max value */
    1857          15 :                 shinfo->gso_size = max(gso_size, tail_gso_size);
    1858          15 :                 shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
    1859             : 
    1860          15 :                 sk->sk_backlog.len += delta;
    1861          15 :                 __NET_INC_STATS(sock_net(sk),
    1862             :                                 LINUX_MIB_TCPBACKLOGCOALESCE);
    1863          15 :                 kfree_skb_partial(skb, fragstolen);
    1864          15 :                 return false;
    1865             :         }
    1866           1 :         __skb_push(skb, hdrlen);
    1867             : 
    1868         142 : no_coalesce:
    1869             :         /* Only socket owner can try to collapse/prune rx queues
    1870             :          * to reduce memory overhead, so add a little headroom here.
    1871             :          * Few sockets backlog are possibly concurrently non empty.
    1872             :          */
    1873         142 :         limit += 64*1024;
    1874             : 
    1875         142 :         if (unlikely(sk_add_backlog(sk, skb, limit))) {
    1876           0 :                 bh_unlock_sock(sk);
    1877           0 :                 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
    1878           0 :                 return true;
    1879             :         }
    1880             :         return false;
    1881             : }
    1882             : EXPORT_SYMBOL(tcp_add_backlog);
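                      : 
                      : /* Editor's sketch of the coalescing test above with illustrative
                      :  * sequence numbers: if the backlog tail covers [1000, 2000) and the
                      :  * new skb starts at seq 2000 with 500 bytes of payload and
                      :  * compatible flags, the two merge and the tail then covers
                      :  * [1000, 2500), counting as one backlog entry instead of two.
                      :  */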
    1883             : 
    1884         438 : int tcp_filter(struct sock *sk, struct sk_buff *skb)
    1885             : {
    1886         438 :         struct tcphdr *th = (struct tcphdr *)skb->data;
    1887             : 
    1888         438 :         return sk_filter_trim_cap(sk, skb, th->doff * 4);
    1889             : }
    1890             : EXPORT_SYMBOL(tcp_filter);
    1891             : 
    1892           0 : static void tcp_v4_restore_cb(struct sk_buff *skb)
    1893             : {
    1894           0 :         memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
    1895             :                 sizeof(struct inet_skb_parm));
    1896             : }
    1897             : 
    1898         438 : static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
    1899             :                            const struct tcphdr *th)
    1900             : {
     1901             :         /* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
     1902             :          * barrier() makes sure the compiler won't play fool^Waliasing games.
    1903             :          */
    1904         438 :         memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
    1905             :                 sizeof(struct inet_skb_parm));
    1906         438 :         barrier();
    1907             : 
    1908         438 :         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
    1909         438 :         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
    1910         438 :                                     skb->len - th->doff * 4);
    1911         438 :         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
    1912         438 :         TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
    1913         438 :         TCP_SKB_CB(skb)->tcp_tw_isn = 0;
    1914         438 :         TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
    1915         438 :         TCP_SKB_CB(skb)->sacked       = 0;
    1916         876 :         TCP_SKB_CB(skb)->has_rxtstamp =
    1917         438 :                         skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
    1918         438 : }
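
/* Editorial sketch: the end_seq computation from tcp_v4_fill_cb()
 * above.  SYN and FIN each consume one sequence number, so
 * end_seq = seq + syn + fin + payload_len.  Standalone, with
 * hypothetical segment values.
 */
#include <stdint.h>
#include <stdio.h>

static uint32_t tcp_end_seq(uint32_t seq, int syn, int fin, uint32_t payload)
{
        return seq + syn + fin + payload;
}

int main(void)
{
        printf("%u\n", tcp_end_seq(1000, 1, 0, 0));   /* bare SYN -> 1001 */
        printf("%u\n", tcp_end_seq(2000, 0, 1, 100)); /* data+FIN -> 2101 */
        return 0;
}
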
    1919             : 
    1920             : /*
    1921             :  *      From tcp_input.c
    1922             :  */
    1923             : 
    1924         438 : int tcp_v4_rcv(struct sk_buff *skb)
    1925             : {
    1926         438 :         struct net *net = dev_net(skb->dev);
    1927         438 :         struct sk_buff *skb_to_free;
    1928         438 :         int sdif = inet_sdif(skb);
    1929         438 :         int dif = inet_iif(skb);
    1930         438 :         const struct iphdr *iph;
    1931         438 :         const struct tcphdr *th;
    1932         438 :         bool refcounted;
    1933         438 :         struct sock *sk;
    1934         438 :         int ret;
    1935             : 
    1936         438 :         if (skb->pkt_type != PACKET_HOST)
    1937           0 :                 goto discard_it;
    1938             : 
    1939             :         /* Count it even if it's bad */
    1940         438 :         __TCP_INC_STATS(net, TCP_MIB_INSEGS);
    1941             : 
    1942         438 :         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
    1943           0 :                 goto discard_it;
    1944             : 
    1945         438 :         th = (const struct tcphdr *)skb->data;
    1946             : 
    1947         438 :         if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
    1948           0 :                 goto bad_packet;
    1949         438 :         if (!pskb_may_pull(skb, th->doff * 4))
    1950           0 :                 goto discard_it;
    1951             : 
    1952             :         /* An explanation is required here:
    1953             :          * packet length and doff are validated by header prediction,
    1954             :          * provided the th->doff == 0 case is eliminated.
    1955             :          * So we defer the checks. */
    1956             : 
    1957         438 :         if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
    1958           0 :                 goto csum_error;
    1959             : 
    1960         438 :         th = (const struct tcphdr *)skb->data;
    1961         438 :         iph = ip_hdr(skb);
    1962         438 : lookup:
    1963         876 :         sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
    1964         438 :                                th->dest, sdif, &refcounted);
    1965         438 :         if (!sk)
    1966           0 :                 goto no_tcp_socket;
    1967             : 
    1968         438 : process:
    1969         438 :         if (sk->sk_state == TCP_TIME_WAIT)
    1970           0 :                 goto do_time_wait;
    1971             : 
    1972         438 :         if (sk->sk_state == TCP_NEW_SYN_RECV) {
    1973           4 :                 struct request_sock *req = inet_reqsk(sk);
    1974           4 :                 bool req_stolen = false;
    1975           4 :                 struct sock *nsk;
    1976             : 
    1977           4 :                 sk = req->rsk_listener;
    1978           4 :                 if (unlikely(tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))) {
    1979             :                         sk_drops_add(sk, skb);
    1980             :                         reqsk_put(req);
    1981             :                         goto discard_it;
    1982             :                 }
    1983           4 :                 if (tcp_checksum_complete(skb)) {
    1984           0 :                         reqsk_put(req);
    1985           0 :                         goto csum_error;
    1986             :                 }
    1987           4 :                 if (unlikely(sk->sk_state != TCP_LISTEN)) {
    1988           0 :                         inet_csk_reqsk_queue_drop_and_put(sk, req);
    1989           0 :                         goto lookup;
    1990             :                 }
    1991             :                 /* We own a reference on the listener, increase it again
    1992             :                  * as we might lose it too soon.
    1993             :                  */
    1994           4 :                 sock_hold(sk);
    1995           4 :                 refcounted = true;
    1996           4 :                 nsk = NULL;
    1997           4 :                 if (!tcp_filter(sk, skb)) {
    1998           4 :                         th = (const struct tcphdr *)skb->data;
    1999           4 :                         iph = ip_hdr(skb);
    2000           4 :                         tcp_v4_fill_cb(skb, iph, th);
    2001           4 :                         nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
    2002             :                 }
    2003           4 :                 if (!nsk) {
    2004           0 :                         reqsk_put(req);
    2005           0 :                         if (req_stolen) {
    2006             :                         /* Another CPU got exclusive access to req
    2007             :                          * and created a full-blown socket.
    2008             :                          * Try to feed this packet to that socket
    2009             :                          * instead of discarding it.
    2010             :                          */
    2011           0 :                                 tcp_v4_restore_cb(skb);
    2012           0 :                                 sock_put(sk);
    2013           0 :                                 goto lookup;
    2014             :                         }
    2015           0 :                         goto discard_and_relse;
    2016             :                 }
    2017           4 :                 if (nsk == sk) {
    2018           0 :                         reqsk_put(req);
    2019           0 :                         tcp_v4_restore_cb(skb);
    2020           4 :                 } else if (tcp_child_process(sk, nsk, skb)) {
    2021           0 :                         tcp_v4_send_reset(nsk, skb);
    2022           0 :                         goto discard_and_relse;
    2023             :                 } else {
    2024           4 :                         sock_put(sk);
    2025           4 :                         return 0;
    2026             :                 }
    2027             :         }
    2028         434 :         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
    2029           0 :                 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
    2030           0 :                 goto discard_and_relse;
    2031             :         }
    2032             : 
    2033         434 :         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
    2034             :                 goto discard_and_relse;
    2035             : 
    2036         434 :         if (tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))
    2037             :                 goto discard_and_relse;
    2038             : 
    2039         434 :         nf_reset_ct(skb);
    2040             : 
    2041         434 :         if (tcp_filter(sk, skb))
    2042           0 :                 goto discard_and_relse;
    2043         434 :         th = (const struct tcphdr *)skb->data;
    2044         434 :         iph = ip_hdr(skb);
    2045         434 :         tcp_v4_fill_cb(skb, iph, th);
    2046             : 
    2047         434 :         skb->dev = NULL;
    2048             : 
    2049         434 :         if (sk->sk_state == TCP_LISTEN) {
    2050           4 :                 ret = tcp_v4_do_rcv(sk, skb);
    2051           4 :                 goto put_and_return;
    2052             :         }
    2053             : 
    2054         430 :         sk_incoming_cpu_update(sk);
    2055             : 
    2056         430 :         bh_lock_sock_nested(sk);
    2057         430 :         tcp_segs_in(tcp_sk(sk), skb);
    2058         430 :         ret = 0;
    2059         430 :         if (!sock_owned_by_user(sk)) {
    2060         273 :                 skb_to_free = sk->sk_rx_skb_cache;
    2061         273 :                 sk->sk_rx_skb_cache = NULL;
    2062         273 :                 ret = tcp_v4_do_rcv(sk, skb);
    2063             :         } else {
    2064         157 :                 if (tcp_add_backlog(sk, skb))
    2065           0 :                         goto discard_and_relse;
    2066             :                 skb_to_free = NULL;
    2067             :         }
    2068         430 :         bh_unlock_sock(sk);
    2069         430 :         if (skb_to_free)
    2070           0 :                 __kfree_skb(skb_to_free);
    2071             : 
    2072         430 : put_and_return:
    2073         434 :         if (refcounted)
    2074         430 :                 sock_put(sk);
    2075             : 
    2076             :         return ret;
    2077             : 
    2078           0 : no_tcp_socket:
    2079           0 :         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
    2080             :                 goto discard_it;
    2081             : 
    2082           0 :         tcp_v4_fill_cb(skb, iph, th);
    2083             : 
    2084           0 :         if (tcp_checksum_complete(skb)) {
    2085           0 : csum_error:
    2086           0 :                 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
    2087           0 : bad_packet:
    2088           0 :                 __TCP_INC_STATS(net, TCP_MIB_INERRS);
    2089             :         } else {
    2090           0 :                 tcp_v4_send_reset(NULL, skb);
    2091             :         }
    2092             : 
    2093           0 : discard_it:
    2094             :         /* Discard frame. */
    2095           0 :         kfree_skb(skb);
    2096           0 :         return 0;
    2097             : 
    2098           0 : discard_and_relse:
    2099           0 :         sk_drops_add(sk, skb);
    2100           0 :         if (refcounted)
    2101           0 :                 sock_put(sk);
    2102           0 :         goto discard_it;
    2103             : 
    2104           0 : do_time_wait:
    2105           0 :         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
    2106             :                 inet_twsk_put(inet_twsk(sk));
    2107             :                 goto discard_it;
    2108             :         }
    2109             : 
    2110           0 :         tcp_v4_fill_cb(skb, iph, th);
    2111             : 
    2112           0 :         if (tcp_checksum_complete(skb)) {
    2113           0 :                 inet_twsk_put(inet_twsk(sk));
    2114           0 :                 goto csum_error;
    2115             :         }
    2116           0 :         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
    2117           0 :         case TCP_TW_SYN: {
    2118           0 :                 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
    2119             :                                                         &tcp_hashinfo, skb,
    2120           0 :                                                         __tcp_hdrlen(th),
    2121           0 :                                                         iph->saddr, th->source,
    2122           0 :                                                         iph->daddr, th->dest,
    2123             :                                                         inet_iif(skb),
    2124             :                                                         sdif);
    2125           0 :                 if (sk2) {
    2126           0 :                         inet_twsk_deschedule_put(inet_twsk(sk));
    2127           0 :                         sk = sk2;
    2128           0 :                         tcp_v4_restore_cb(skb);
    2129           0 :                         refcounted = false;
    2130           0 :                         goto process;
    2131             :                 }
    2132             :         }
    2133             :                 /* to ACK */
    2134           0 :                 fallthrough;
    2135             :         case TCP_TW_ACK:
    2136           0 :                 tcp_v4_timewait_ack(sk, skb);
    2137           0 :                 break;
    2138           0 :         case TCP_TW_RST:
    2139           0 :                 tcp_v4_send_reset(sk, skb);
    2140           0 :                 inet_twsk_deschedule_put(inet_twsk(sk));
    2141           0 :                 goto discard_it;
    2142           0 :         case TCP_TW_SUCCESS:;
    2143             :         }
    2144           0 :         goto discard_it;
    2145             : }
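
/* Editorial sketch: the ownership/backlog split that tcp_v4_rcv() uses
 * (bh_lock_sock_nested / sock_owned_by_user / tcp_add_backlog), as a
 * userspace analogue.  If a user context owns the "socket", softirq
 * delivery queues packets on a backlog drained at release time.  All
 * names here are hypothetical.
 */
#include <pthread.h>
#include <stdio.h>

struct fake_sock {
        pthread_mutex_t lock;
        int owned_by_user;
        int backlog[16];
        int backlog_len;
};

static void deliver(struct fake_sock *sk, int pkt)
{
        pthread_mutex_lock(&sk->lock);
        if (!sk->owned_by_user)
                printf("process %d directly\n", pkt);     /* tcp_v4_do_rcv() */
        else if (sk->backlog_len < 16)
                sk->backlog[sk->backlog_len++] = pkt;     /* tcp_add_backlog() */
        else
                printf("drop %d (backlog limit)\n", pkt); /* TCPBACKLOGDROP */
        pthread_mutex_unlock(&sk->lock);
}

static void release_sock_analogue(struct fake_sock *sk)
{
        pthread_mutex_lock(&sk->lock);
        for (int i = 0; i < sk->backlog_len; i++)
                printf("process %d from backlog\n", sk->backlog[i]);
        sk->backlog_len = 0;
        sk->owned_by_user = 0;
        pthread_mutex_unlock(&sk->lock);
}

int main(void)
{
        struct fake_sock sk = { PTHREAD_MUTEX_INITIALIZER, 1, {0}, 0 };

        deliver(&sk, 1);            /* queued: user owns the socket */
        release_sock_analogue(&sk); /* drains the backlog           */
        deliver(&sk, 2);            /* processed directly           */
        return 0;
}
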
    2146             : 
    2147             : static struct timewait_sock_ops tcp_timewait_sock_ops = {
    2148             :         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
    2149             :         .twsk_unique    = tcp_twsk_unique,
    2150             :         .twsk_destructor = tcp_twsk_destructor,
    2151             : };
    2152             : 
    2153           4 : void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
    2154             : {
    2155           4 :         struct dst_entry *dst = skb_dst(skb);
    2156             : 
    2157           4 :         if (dst && dst_hold_safe(dst)) {
    2158           4 :                 sk->sk_rx_dst = dst;
    2159           4 :                 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
    2160             :         }
    2161           4 : }
    2162             : EXPORT_SYMBOL(inet_sk_rx_dst_set);
    2163             : 
    2164             : const struct inet_connection_sock_af_ops ipv4_specific = {
    2165             :         .queue_xmit        = ip_queue_xmit,
    2166             :         .send_check        = tcp_v4_send_check,
    2167             :         .rebuild_header    = inet_sk_rebuild_header,
    2168             :         .sk_rx_dst_set     = inet_sk_rx_dst_set,
    2169             :         .conn_request      = tcp_v4_conn_request,
    2170             :         .syn_recv_sock     = tcp_v4_syn_recv_sock,
    2171             :         .net_header_len    = sizeof(struct iphdr),
    2172             :         .setsockopt        = ip_setsockopt,
    2173             :         .getsockopt        = ip_getsockopt,
    2174             :         .addr2sockaddr     = inet_csk_addr2sockaddr,
    2175             :         .sockaddr_len      = sizeof(struct sockaddr_in),
    2176             :         .mtu_reduced       = tcp_v4_mtu_reduced,
    2177             : };
    2178             : EXPORT_SYMBOL(ipv4_specific);
    2179             : 
    2180             : #ifdef CONFIG_TCP_MD5SIG
    2181             : static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
    2182             :         .md5_lookup             = tcp_v4_md5_lookup,
    2183             :         .calc_md5_hash          = tcp_v4_md5_hash_skb,
    2184             :         .md5_parse              = tcp_v4_parse_md5_keys,
    2185             : };
    2186             : #endif
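
/* Editorial sketch: the md5_parse hook above (tcp_v4_parse_md5_keys)
 * services the TCP_MD5SIG socket option (RFC 2385).  A minimal
 * userspace caller might look like this; peer address and key are
 * hypothetical, and both endpoints must install matching keys.
 */
#include <linux/tcp.h>      /* struct tcp_md5sig, TCP_MD5SIG */
#include <netinet/in.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
        int fd = socket(AF_INET, SOCK_STREAM, 0);
        struct sockaddr_in peer = {
                .sin_family = AF_INET,
                .sin_addr.s_addr = htonl(0xc0000201), /* 192.0.2.1 */
        };
        struct tcp_md5sig md5 = { .tcpm_keylen = 6 };

        memcpy(&md5.tcpm_addr, &peer, sizeof(peer));
        memcpy(md5.tcpm_key, "secret", 6);
        setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
        close(fd);
        return 0;
}
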
    2187             : 
    2188             : /* NOTE: A lot of fields are explicitly zeroed by the call to
    2189             :  *       sk_alloc(), so they need not be initialized here.
    2190             :  */
    2191           3 : static int tcp_v4_init_sock(struct sock *sk)
    2192             : {
    2193           3 :         struct inet_connection_sock *icsk = inet_csk(sk);
    2194             : 
    2195           3 :         tcp_init_sock(sk);
    2196             : 
    2197           3 :         icsk->icsk_af_ops = &ipv4_specific;
    2198             : 
    2199             : #ifdef CONFIG_TCP_MD5SIG
    2200             :         tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
    2201             : #endif
    2202             : 
    2203           3 :         return 0;
    2204             : }
    2205             : 
    2206           4 : void tcp_v4_destroy_sock(struct sock *sk)
    2207             : {
    2208           4 :         struct tcp_sock *tp = tcp_sk(sk);
    2209             : 
    2210           4 :         trace_tcp_destroy_sock(sk);
    2211             : 
    2212           4 :         tcp_clear_xmit_timers(sk);
    2213             : 
    2214           4 :         tcp_cleanup_congestion_control(sk);
    2215             : 
    2216           4 :         tcp_cleanup_ulp(sk);
    2217             : 
    2218             :         /* Clean up the write buffer. */
    2219           4 :         tcp_write_queue_purge(sk);
    2220             : 
    2221             :         /* Check if we want to disable active TFO */
    2222           4 :         tcp_fastopen_active_disable_ofo_check(sk);
    2223             : 
    2224             :         /* Cleans up our, hopefully empty, out_of_order_queue. */
    2225           4 :         skb_rbtree_purge(&tp->out_of_order_queue);
    2226             : 
    2227             : #ifdef CONFIG_TCP_MD5SIG
    2228             :         /* Clean up the MD5 key list, if any */
    2229             :         if (tp->md5sig_info) {
    2230             :                 tcp_clear_md5_list(sk);
    2231             :                 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
    2232             :                 tp->md5sig_info = NULL;
    2233             :         }
    2234             : #endif
    2235             : 
    2236             :         /* Clean up a referenced TCP bind bucket. */
    2237           4 :         if (inet_csk(sk)->icsk_bind_hash)
    2238           1 :                 inet_put_port(sk);
    2239             : 
    2240           4 :         BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
    2241             : 
    2242             :         /* If the socket was aborted during a connect operation */
    2243           4 :         tcp_free_fastopen_req(tp);
    2244           4 :         tcp_fastopen_destroy_cipher(sk);
    2245           4 :         tcp_saved_syn_free(tp);
    2246             : 
    2247           4 :         sk_sockets_allocated_dec(sk);
    2248           4 : }
    2249             : EXPORT_SYMBOL(tcp_v4_destroy_sock);
    2250             : 
    2251             : #ifdef CONFIG_PROC_FS
    2252             : /* Proc filesystem TCP sock list dumping. */
    2253             : 
    2254             : /*
    2255             :  * Get the next listener socket following cur.  If cur is NULL, get the first socket
    2256             :  * starting from bucket given in st->bucket; when st->bucket is zero the
    2257             :  * very first socket in the hash table is returned.
    2258             :  */
    2259           0 : static void *listening_get_next(struct seq_file *seq, void *cur)
    2260             : {
    2261           0 :         struct tcp_seq_afinfo *afinfo;
    2262           0 :         struct tcp_iter_state *st = seq->private;
    2263           0 :         struct net *net = seq_file_net(seq);
    2264           0 :         struct inet_listen_hashbucket *ilb;
    2265           0 :         struct hlist_nulls_node *node;
    2266           0 :         struct sock *sk = cur;
    2267             : 
    2268           0 :         if (st->bpf_seq_afinfo)
    2269             :                 afinfo = st->bpf_seq_afinfo;
    2270             :         else
    2271           0 :                 afinfo = PDE_DATA(file_inode(seq->file));
    2272             : 
    2273           0 :         if (!sk) {
    2274           0 : get_head:
    2275           0 :                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
    2276           0 :                 spin_lock(&ilb->lock);
    2277           0 :                 sk = sk_nulls_head(&ilb->nulls_head);
    2278           0 :                 st->offset = 0;
    2279           0 :                 goto get_sk;
    2280             :         }
    2281           0 :         ilb = &tcp_hashinfo.listening_hash[st->bucket];
    2282           0 :         ++st->num;
    2283           0 :         ++st->offset;
    2284             : 
    2285           0 :         sk = sk_nulls_next(sk);
    2286           0 : get_sk:
    2287           0 :         sk_nulls_for_each_from(sk, node) {
    2288           0 :                 if (!net_eq(sock_net(sk), net))
    2289             :                         continue;
    2290           0 :                 if (afinfo->family == AF_UNSPEC ||
    2291           0 :                     sk->sk_family == afinfo->family)
    2292           0 :                         return sk;
    2293             :         }
    2294           0 :         spin_unlock(&ilb->lock);
    2295           0 :         st->offset = 0;
    2296           0 :         if (++st->bucket < INET_LHTABLE_SIZE)
    2297           0 :                 goto get_head;
    2298             :         return NULL;
    2299             : }
    2300             : 
    2301           0 : static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
    2302             : {
    2303           0 :         struct tcp_iter_state *st = seq->private;
    2304           0 :         void *rc;
    2305             : 
    2306           0 :         st->bucket = 0;
    2307           0 :         st->offset = 0;
    2308           0 :         rc = listening_get_next(seq, NULL);
    2309             : 
    2310           0 :         while (rc && *pos) {
    2311           0 :                 rc = listening_get_next(seq, rc);
    2312           0 :                 --*pos;
    2313             :         }
    2314           0 :         return rc;
    2315             : }
    2316             : 
    2317           0 : static inline bool empty_bucket(const struct tcp_iter_state *st)
    2318             : {
    2319           0 :         return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
    2320             : }
    2321             : 
    2322             : /*
    2323             :  * Get first established socket starting from bucket given in st->bucket.
    2324             :  * If st->bucket is zero, the very first socket in the hash is returned.
    2325             :  */
    2326           0 : static void *established_get_first(struct seq_file *seq)
    2327             : {
    2328           0 :         struct tcp_seq_afinfo *afinfo;
    2329           0 :         struct tcp_iter_state *st = seq->private;
    2330           0 :         struct net *net = seq_file_net(seq);
    2331           0 :         void *rc = NULL;
    2332             : 
    2333           0 :         if (st->bpf_seq_afinfo)
    2334             :                 afinfo = st->bpf_seq_afinfo;
    2335             :         else
    2336           0 :                 afinfo = PDE_DATA(file_inode(seq->file));
    2337             : 
    2338           0 :         st->offset = 0;
    2339           0 :         for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
    2340           0 :                 struct sock *sk;
    2341           0 :                 struct hlist_nulls_node *node;
    2342           0 :                 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
    2343             : 
    2344             :                 /* Lockless fast path for the common case of empty buckets */
    2345           0 :                 if (empty_bucket(st))
    2346           0 :                         continue;
    2347             : 
    2348           0 :                 spin_lock_bh(lock);
    2349           0 :                 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
    2350           0 :                         if ((afinfo->family != AF_UNSPEC &&
    2351           0 :                              sk->sk_family != afinfo->family) ||
    2352           0 :                             !net_eq(sock_net(sk), net)) {
    2353           0 :                                 continue;
    2354             :                         }
    2355           0 :                         rc = sk;
    2356           0 :                         goto out;
    2357             :                 }
    2358           0 :                 spin_unlock_bh(lock);
    2359             :         }
    2360           0 : out:
    2361           0 :         return rc;
    2362             : }
    2363             : 
    2364           0 : static void *established_get_next(struct seq_file *seq, void *cur)
    2365             : {
    2366           0 :         struct tcp_seq_afinfo *afinfo;
    2367           0 :         struct sock *sk = cur;
    2368           0 :         struct hlist_nulls_node *node;
    2369           0 :         struct tcp_iter_state *st = seq->private;
    2370           0 :         struct net *net = seq_file_net(seq);
    2371             : 
    2372           0 :         if (st->bpf_seq_afinfo)
    2373             :                 afinfo = st->bpf_seq_afinfo;
    2374             :         else
    2375           0 :                 afinfo = PDE_DATA(file_inode(seq->file));
    2376             : 
    2377           0 :         ++st->num;
    2378           0 :         ++st->offset;
    2379             : 
    2380           0 :         sk = sk_nulls_next(sk);
    2381             : 
    2382           0 :         sk_nulls_for_each_from(sk, node) {
    2383           0 :                 if ((afinfo->family == AF_UNSPEC ||
    2384           0 :                      sk->sk_family == afinfo->family) &&
    2385           0 :                     net_eq(sock_net(sk), net))
    2386           0 :                         return sk;
    2387             :         }
    2388             : 
    2389           0 :         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
    2390           0 :         ++st->bucket;
    2391           0 :         return established_get_first(seq);
    2392             : }
    2393             : 
    2394           0 : static void *established_get_idx(struct seq_file *seq, loff_t pos)
    2395             : {
    2396           0 :         struct tcp_iter_state *st = seq->private;
    2397           0 :         void *rc;
    2398             : 
    2399           0 :         st->bucket = 0;
    2400           0 :         rc = established_get_first(seq);
    2401             : 
    2402           0 :         while (rc && pos) {
    2403           0 :                 rc = established_get_next(seq, rc);
    2404           0 :                 --pos;
    2405             :         }
    2406           0 :         return rc;
    2407             : }
    2408             : 
    2409           0 : static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
    2410             : {
    2411           0 :         void *rc;
    2412           0 :         struct tcp_iter_state *st = seq->private;
    2413             : 
    2414           0 :         st->state = TCP_SEQ_STATE_LISTENING;
    2415           0 :         rc        = listening_get_idx(seq, &pos);
    2416             : 
    2417           0 :         if (!rc) {
    2418           0 :                 st->state = TCP_SEQ_STATE_ESTABLISHED;
    2419           0 :                 rc        = established_get_idx(seq, pos);
    2420             :         }
    2421             : 
    2422           0 :         return rc;
    2423             : }
    2424             : 
    2425           0 : static void *tcp_seek_last_pos(struct seq_file *seq)
    2426             : {
    2427           0 :         struct tcp_iter_state *st = seq->private;
    2428           0 :         int offset = st->offset;
    2429           0 :         int orig_num = st->num;
    2430           0 :         void *rc = NULL;
    2431             : 
    2432           0 :         switch (st->state) {
    2433           0 :         case TCP_SEQ_STATE_LISTENING:
    2434           0 :                 if (st->bucket >= INET_LHTABLE_SIZE)
    2435             :                         break;
    2436           0 :                 st->state = TCP_SEQ_STATE_LISTENING;
    2437           0 :                 rc = listening_get_next(seq, NULL);
    2438           0 :                 while (offset-- && rc)
    2439           0 :                         rc = listening_get_next(seq, rc);
    2440           0 :                 if (rc)
    2441             :                         break;
    2442           0 :                 st->bucket = 0;
    2443           0 :                 st->state = TCP_SEQ_STATE_ESTABLISHED;
    2444           0 :                 fallthrough;
    2445           0 :         case TCP_SEQ_STATE_ESTABLISHED:
    2446           0 :                 if (st->bucket > tcp_hashinfo.ehash_mask)
    2447             :                         break;
    2448           0 :                 rc = established_get_first(seq);
    2449           0 :                 while (offset-- && rc)
    2450           0 :                         rc = established_get_next(seq, rc);
    2451             :         }
    2452             : 
    2453           0 :         st->num = orig_num;
    2454             : 
    2455           0 :         return rc;
    2456             : }
    2457             : 
    2458           0 : void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
    2459             : {
    2460           0 :         struct tcp_iter_state *st = seq->private;
    2461           0 :         void *rc;
    2462             : 
    2463           0 :         if (*pos && *pos == st->last_pos) {
    2464           0 :                 rc = tcp_seek_last_pos(seq);
    2465           0 :                 if (rc)
    2466           0 :                         goto out;
    2467             :         }
    2468             : 
    2469           0 :         st->state = TCP_SEQ_STATE_LISTENING;
    2470           0 :         st->num = 0;
    2471           0 :         st->bucket = 0;
    2472           0 :         st->offset = 0;
    2473           0 :         rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
    2474             : 
    2475           0 : out:
    2476           0 :         st->last_pos = *pos;
    2477           0 :         return rc;
    2478             : }
    2479             : EXPORT_SYMBOL(tcp_seq_start);
    2480             : 
    2481           0 : void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
    2482             : {
    2483           0 :         struct tcp_iter_state *st = seq->private;
    2484           0 :         void *rc = NULL;
    2485             : 
    2486           0 :         if (v == SEQ_START_TOKEN) {
    2487           0 :                 rc = tcp_get_idx(seq, 0);
    2488           0 :                 goto out;
    2489             :         }
    2490             : 
    2491           0 :         switch (st->state) {
    2492           0 :         case TCP_SEQ_STATE_LISTENING:
    2493           0 :                 rc = listening_get_next(seq, v);
    2494           0 :                 if (!rc) {
    2495           0 :                         st->state = TCP_SEQ_STATE_ESTABLISHED;
    2496           0 :                         st->bucket = 0;
    2497           0 :                         st->offset = 0;
    2498           0 :                         rc        = established_get_first(seq);
    2499             :                 }
    2500             :                 break;
    2501           0 :         case TCP_SEQ_STATE_ESTABLISHED:
    2502           0 :                 rc = established_get_next(seq, v);
    2503           0 :                 break;
    2504             :         }
    2505           0 : out:
    2506           0 :         ++*pos;
    2507           0 :         st->last_pos = *pos;
    2508           0 :         return rc;
    2509             : }
    2510             : EXPORT_SYMBOL(tcp_seq_next);
    2511             : 
    2512           0 : void tcp_seq_stop(struct seq_file *seq, void *v)
    2513             : {
    2514           0 :         struct tcp_iter_state *st = seq->private;
    2515             : 
    2516           0 :         switch (st->state) {
    2517           0 :         case TCP_SEQ_STATE_LISTENING:
    2518           0 :                 if (v != SEQ_START_TOKEN)
    2519           0 :                         spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
    2520             :                 break;
    2521           0 :         case TCP_SEQ_STATE_ESTABLISHED:
    2522           0 :                 if (v)
    2523           0 :                         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
    2524             :                 break;
    2525             :         }
    2526           0 : }
    2527             : EXPORT_SYMBOL(tcp_seq_stop);
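
/* Editorial sketch: the (bucket, offset) resume scheme behind
 * tcp_seq_start()/tcp_seek_last_pos() above.  Rather than re-walking
 * the whole hash on every read() of the seq file, the iterator saves
 * its bucket and intra-bucket offset and picks up from there.
 * Hypothetical userspace analogue with a tiny zero-terminated table.
 */
#include <stdio.h>

#define NBUCKETS 4

static const int table[NBUCKETS][3] = {
        { 1, 2, 0 }, { 0 }, { 3, 0 }, { 4, 5, 0 },
};

struct iter_state { int bucket, offset; };

static int iter_next(struct iter_state *st)
{
        for (; st->bucket < NBUCKETS; st->bucket++, st->offset = 0) {
                int v = table[st->bucket][st->offset];

                if (v) {
                        st->offset++;   /* resume point for the next call */
                        return v;
                }
        }
        return 0;
}

int main(void)
{
        struct iter_state st = { 0, 0 };
        int v;

        while ((v = iter_next(&st)))
                printf("sock %d (bucket %d)\n", v, st.bucket);
        return 0;
}
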
    2528             : 
    2529           0 : static void get_openreq4(const struct request_sock *req,
    2530             :                          struct seq_file *f, int i)
    2531             : {
    2532           0 :         const struct inet_request_sock *ireq = inet_rsk(req);
    2533           0 :         long delta = req->rsk_timer.expires - jiffies;
    2534             : 
    2535           0 :         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
    2536             :                 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
    2537             :                 i,
    2538             :                 ireq->ir_loc_addr,
    2539           0 :                 ireq->ir_num,
    2540             :                 ireq->ir_rmt_addr,
    2541           0 :                 ntohs(ireq->ir_rmt_port),
    2542             :                 TCP_SYN_RECV,
    2543             :                 0, 0, /* could print option size, but that is af dependent. */
    2544             :                 1,    /* timers active (only the expire timer) */
    2545             :                 jiffies_delta_to_clock_t(delta),
    2546           0 :                 req->num_timeout,
    2547             :                 from_kuid_munged(seq_user_ns(f),
    2548             :                                  sock_i_uid(req->rsk_listener)),
    2549             :                 0,  /* non standard timer */
    2550             :                 0, /* open_requests have no inode */
    2551             :                 0,
    2552             :                 req);
    2553           0 : }
    2554             : 
    2555           0 : static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
    2556             : {
    2557           0 :         int timer_active;
    2558           0 :         unsigned long timer_expires;
    2559           0 :         const struct tcp_sock *tp = tcp_sk(sk);
    2560           0 :         const struct inet_connection_sock *icsk = inet_csk(sk);
    2561           0 :         const struct inet_sock *inet = inet_sk(sk);
    2562           0 :         const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
    2563           0 :         __be32 dest = inet->inet_daddr;
    2564           0 :         __be32 src = inet->inet_rcv_saddr;
    2565           0 :         __u16 destp = ntohs(inet->inet_dport);
    2566           0 :         __u16 srcp = ntohs(inet->inet_sport);
    2567           0 :         int rx_queue;
    2568           0 :         int state;
    2569             : 
    2570           0 :         if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
    2571           0 :             icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
    2572             :             icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
    2573           0 :                 timer_active    = 1;
    2574           0 :                 timer_expires   = icsk->icsk_timeout;
    2575           0 :         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
    2576           0 :                 timer_active    = 4;
    2577           0 :                 timer_expires   = icsk->icsk_timeout;
    2578           0 :         } else if (timer_pending(&sk->sk_timer)) {
    2579           0 :                 timer_active    = 2;
    2580           0 :                 timer_expires   = sk->sk_timer.expires;
    2581             :         } else {
    2582           0 :                 timer_active    = 0;
    2583           0 :                 timer_expires = jiffies;
    2584             :         }
    2585             : 
    2586           0 :         state = inet_sk_state_load(sk);
    2587           0 :         if (state == TCP_LISTEN)
    2588           0 :                 rx_queue = READ_ONCE(sk->sk_ack_backlog);
    2589             :         else
    2590             :                 /* Because we don't lock the socket,
    2591             :                  * we might find a transient negative value.
    2592             :                  */
    2593           0 :                 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
    2594             :                                       READ_ONCE(tp->copied_seq), 0);
    2595             : 
    2596           0 :         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
    2597             :                         "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
    2598             :                 i, src, srcp, dest, destp, state,
    2599           0 :                 READ_ONCE(tp->write_seq) - tp->snd_una,
    2600             :                 rx_queue,
    2601             :                 timer_active,
    2602           0 :                 jiffies_delta_to_clock_t(timer_expires - jiffies),
    2603           0 :                 icsk->icsk_retransmits,
    2604             :                 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
    2605           0 :                 icsk->icsk_probes_out,
    2606             :                 sock_i_ino(sk),
    2607           0 :                 refcount_read(&sk->sk_refcnt), sk,
    2608           0 :                 jiffies_to_clock_t(icsk->icsk_rto),
    2609           0 :                 jiffies_to_clock_t(icsk->icsk_ack.ato),
    2610           0 :                 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
    2611             :                 tp->snd_cwnd,
    2612             :                 state == TCP_LISTEN ?
    2613           0 :                     fastopenq->max_qlen :
    2614           0 :                     (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
    2615           0 : }
    2616             : 
    2617           0 : static void get_timewait4_sock(const struct inet_timewait_sock *tw,
    2618             :                                struct seq_file *f, int i)
    2619             : {
    2620           0 :         long delta = tw->tw_timer.expires - jiffies;
    2621           0 :         __be32 dest, src;
    2622           0 :         __u16 destp, srcp;
    2623             : 
    2624           0 :         dest  = tw->tw_daddr;
    2625           0 :         src   = tw->tw_rcv_saddr;
    2626           0 :         destp = ntohs(tw->tw_dport);
    2627           0 :         srcp  = ntohs(tw->tw_sport);
    2628             : 
    2629           0 :         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
    2630             :                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
    2631           0 :                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
    2632             :                 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
    2633             :                 refcount_read(&tw->tw_refcnt), tw);
    2634           0 : }
    2635             : 
    2636             : #define TMPSZ 150
    2637             : 
    2638           0 : static int tcp4_seq_show(struct seq_file *seq, void *v)
    2639             : {
    2640           0 :         struct tcp_iter_state *st;
    2641           0 :         struct sock *sk = v;
    2642             : 
    2643           0 :         seq_setwidth(seq, TMPSZ - 1);
    2644           0 :         if (v == SEQ_START_TOKEN) {
    2645           0 :                 seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
    2646             :                            "rx_queue tr tm->when retrnsmt   uid  timeout "
    2647             :                            "inode");
    2648           0 :                 goto out;
    2649             :         }
    2650           0 :         st = seq->private;
    2651             : 
    2652           0 :         if (sk->sk_state == TCP_TIME_WAIT)
    2653           0 :                 get_timewait4_sock(v, seq, st->num);
    2654           0 :         else if (sk->sk_state == TCP_NEW_SYN_RECV)
    2655           0 :                 get_openreq4(v, seq, st->num);
    2656             :         else
    2657           0 :                 get_tcp4_sock(v, seq, st->num);
    2658           0 : out:
    2659           0 :         seq_pad(seq, '\n');
    2660           0 :         return 0;
    2661             : }
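
/* Editorial sketch: consuming the table tcp4_seq_show() emits.  Each
 * data row starts "%4d: %08X:%04X %08X:%04X %02X ..." (slot, local
 * addr:port, remote addr:port, state), addresses being hex-encoded
 * __be32 values.  Minimal error handling; field use is illustrative.
 */
#include <stdio.h>

int main(void)
{
        char line[256];
        FILE *f = fopen("/proc/net/tcp", "r");

        if (!f)
                return 1;
        fgets(line, sizeof(line), f);   /* skip the header row */
        while (fgets(line, sizeof(line), f)) {
                unsigned int sl, laddr, lport, raddr, rport, st;

                if (sscanf(line, "%u: %x:%x %x:%x %x",
                           &sl, &laddr, &lport, &raddr, &rport, &st) == 6)
                        printf("local port %u, state %02X\n", lport, st);
        }
        fclose(f);
        return 0;
}
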
    2662             : 
    2663             : #ifdef CONFIG_BPF_SYSCALL
    2664             : struct bpf_iter__tcp {
    2665             :         __bpf_md_ptr(struct bpf_iter_meta *, meta);
    2666             :         __bpf_md_ptr(struct sock_common *, sk_common);
    2667             :         uid_t uid __aligned(8);
    2668             : };
    2669             : 
    2670             : static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
    2671             :                              struct sock_common *sk_common, uid_t uid)
    2672             : {
    2673             :         struct bpf_iter__tcp ctx;
    2674             : 
    2675             :         meta->seq_num--;  /* skip SEQ_START_TOKEN */
    2676             :         ctx.meta = meta;
    2677             :         ctx.sk_common = sk_common;
    2678             :         ctx.uid = uid;
    2679             :         return bpf_iter_run_prog(prog, &ctx);
    2680             : }
    2681             : 
    2682             : static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
    2683             : {
    2684             :         struct bpf_iter_meta meta;
    2685             :         struct bpf_prog *prog;
    2686             :         struct sock *sk = v;
    2687             :         uid_t uid;
    2688             : 
    2689             :         if (v == SEQ_START_TOKEN)
    2690             :                 return 0;
    2691             : 
    2692             :         if (sk->sk_state == TCP_TIME_WAIT) {
    2693             :                 uid = 0;
    2694             :         } else if (sk->sk_state == TCP_NEW_SYN_RECV) {
    2695             :                 const struct request_sock *req = v;
    2696             : 
    2697             :                 uid = from_kuid_munged(seq_user_ns(seq),
    2698             :                                        sock_i_uid(req->rsk_listener));
    2699             :         } else {
    2700             :                 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
    2701             :         }
    2702             : 
    2703             :         meta.seq = seq;
    2704             :         prog = bpf_iter_get_info(&meta, false);
    2705             :         return tcp_prog_seq_show(prog, &meta, v, uid);
    2706             : }
    2707             : 
    2708             : static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
    2709             : {
    2710             :         struct bpf_iter_meta meta;
    2711             :         struct bpf_prog *prog;
    2712             : 
    2713             :         if (!v) {
    2714             :                 meta.seq = seq;
    2715             :                 prog = bpf_iter_get_info(&meta, true);
    2716             :                 if (prog)
    2717             :                         (void)tcp_prog_seq_show(prog, &meta, v, 0);
    2718             :         }
    2719             : 
    2720             :         tcp_seq_stop(seq, v);
    2721             : }
    2722             : 
    2723             : static const struct seq_operations bpf_iter_tcp_seq_ops = {
    2724             :         .show           = bpf_iter_tcp_seq_show,
    2725             :         .start          = tcp_seq_start,
    2726             :         .next           = tcp_seq_next,
    2727             :         .stop           = bpf_iter_tcp_seq_stop,
    2728             : };
    2729             : #endif
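
/* Editorial sketch: a BPF iterator program that attaches to the
 * bpf_iter__tcp context defined above.  Assumes a libbpf build with a
 * generated vmlinux.h; the program and section names are hypothetical.
 */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

char LICENSE[] SEC("license") = "GPL";

SEC("iter/tcp")
int dump_tcp(struct bpf_iter__tcp *ctx)
{
        struct sock_common *skc = ctx->sk_common;

        if (!skc)
                return 0;
        BPF_SEQ_PRINTF(ctx->meta->seq, "uid=%u state=%u\n",
                       ctx->uid, skc->skc_state);
        return 0;
}
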
    2730             : 
    2731             : static const struct seq_operations tcp4_seq_ops = {
    2732             :         .show           = tcp4_seq_show,
    2733             :         .start          = tcp_seq_start,
    2734             :         .next           = tcp_seq_next,
    2735             :         .stop           = tcp_seq_stop,
    2736             : };
    2737             : 
    2738             : static struct tcp_seq_afinfo tcp4_seq_afinfo = {
    2739             :         .family         = AF_INET,
    2740             : };
    2741             : 
    2742           1 : static int __net_init tcp4_proc_init_net(struct net *net)
    2743             : {
    2744           1 :         if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
    2745             :                         sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
    2746           0 :                 return -ENOMEM;
    2747             :         return 0;
    2748             : }
    2749             : 
    2750           0 : static void __net_exit tcp4_proc_exit_net(struct net *net)
    2751             : {
    2752           0 :         remove_proc_entry("tcp", net->proc_net);
    2753           0 : }
    2754             : 
    2755             : static struct pernet_operations tcp4_net_ops = {
    2756             :         .init = tcp4_proc_init_net,
    2757             :         .exit = tcp4_proc_exit_net,
    2758             : };
    2759             : 
    2760           1 : int __init tcp4_proc_init(void)
    2761             : {
    2762           1 :         return register_pernet_subsys(&tcp4_net_ops);
    2763             : }
    2764             : 
    2765           0 : void tcp4_proc_exit(void)
    2766             : {
    2767           0 :         unregister_pernet_subsys(&tcp4_net_ops);
    2768           0 : }
    2769             : #endif /* CONFIG_PROC_FS */
    2770             : 
    2771             : /* @wake is one when sk_stream_write_space() calls us.
    2772             :  * This sends EPOLLOUT only if notsent_bytes is less than half the limit.
    2773             :  * This mimics the strategy used in sock_def_write_space().
    2774             :  */
    2775        1046 : bool tcp_stream_memory_free(const struct sock *sk, int wake)
    2776             : {
    2777        1046 :         const struct tcp_sock *tp = tcp_sk(sk);
    2778        1046 :         u32 notsent_bytes = READ_ONCE(tp->write_seq) -
    2779        1046 :                             READ_ONCE(tp->snd_nxt);
    2780             : 
    2781        1046 :         return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
    2782             : }
    2783             : EXPORT_SYMBOL(tcp_stream_memory_free);
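
/* Editorial sketch: tcp_stream_memory_free() above is what makes
 * poll()/epoll report writability only while unsent bytes stay under
 * the TCP_NOTSENT_LOWAT watermark.  The value below is illustrative.
 */
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <stdio.h>
#include <sys/socket.h>
#include <unistd.h>

#ifndef TCP_NOTSENT_LOWAT
#define TCP_NOTSENT_LOWAT 25    /* from linux/tcp.h, for older libcs */
#endif

int main(void)
{
        int fd = socket(AF_INET, SOCK_STREAM, 0);
        int lowat = 128 * 1024; /* wake writers below 128 KiB unsent */

        if (setsockopt(fd, IPPROTO_TCP, TCP_NOTSENT_LOWAT,
                       &lowat, sizeof(lowat)) < 0)
                perror("setsockopt");
        close(fd);
        return 0;
}
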
    2784             : 
    2785             : struct proto tcp_prot = {
    2786             :         .name                   = "TCP",
    2787             :         .owner                  = THIS_MODULE,
    2788             :         .close                  = tcp_close,
    2789             :         .pre_connect            = tcp_v4_pre_connect,
    2790             :         .connect                = tcp_v4_connect,
    2791             :         .disconnect             = tcp_disconnect,
    2792             :         .accept                 = inet_csk_accept,
    2793             :         .ioctl                  = tcp_ioctl,
    2794             :         .init                   = tcp_v4_init_sock,
    2795             :         .destroy                = tcp_v4_destroy_sock,
    2796             :         .shutdown               = tcp_shutdown,
    2797             :         .setsockopt             = tcp_setsockopt,
    2798             :         .getsockopt             = tcp_getsockopt,
    2799             :         .bpf_bypass_getsockopt  = tcp_bpf_bypass_getsockopt,
    2800             :         .keepalive              = tcp_set_keepalive,
    2801             :         .recvmsg                = tcp_recvmsg,
    2802             :         .sendmsg                = tcp_sendmsg,
    2803             :         .sendpage               = tcp_sendpage,
    2804             :         .backlog_rcv            = tcp_v4_do_rcv,
    2805             :         .release_cb             = tcp_release_cb,
    2806             :         .hash                   = inet_hash,
    2807             :         .unhash                 = inet_unhash,
    2808             :         .get_port               = inet_csk_get_port,
    2809             :         .enter_memory_pressure  = tcp_enter_memory_pressure,
    2810             :         .leave_memory_pressure  = tcp_leave_memory_pressure,
    2811             :         .stream_memory_free     = tcp_stream_memory_free,
    2812             :         .sockets_allocated      = &tcp_sockets_allocated,
    2813             :         .orphan_count           = &tcp_orphan_count,
    2814             :         .memory_allocated       = &tcp_memory_allocated,
    2815             :         .memory_pressure        = &tcp_memory_pressure,
    2816             :         .sysctl_mem             = sysctl_tcp_mem,
    2817             :         .sysctl_wmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_wmem),
    2818             :         .sysctl_rmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_rmem),
    2819             :         .max_header             = MAX_TCP_HEADER,
    2820             :         .obj_size               = sizeof(struct tcp_sock),
    2821             :         .slab_flags             = SLAB_TYPESAFE_BY_RCU,
    2822             :         .twsk_prot              = &tcp_timewait_sock_ops,
    2823             :         .rsk_prot               = &tcp_request_sock_ops,
    2824             :         .h.hashinfo             = &tcp_hashinfo,
    2825             :         .no_autobind            = true,
    2826             :         .diag_destroy           = tcp_abort,
    2827             : };
    2828             : EXPORT_SYMBOL(tcp_prot);
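
/* Editorial sketch: which tcp_prot hooks a plain userspace session
 * exercises: socket() -> .init (tcp_v4_init_sock), connect() ->
 * .pre_connect/.connect, send() -> .sendmsg, close() -> .close.
 * The peer address is a hypothetical TEST-NET host.
 */
#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
        int fd = socket(AF_INET, SOCK_STREAM, 0);       /* .init    */
        struct sockaddr_in peer = {
                .sin_family = AF_INET,
                .sin_port = htons(80),
        };

        inet_pton(AF_INET, "192.0.2.1", &peer.sin_addr);
        if (connect(fd, (struct sockaddr *)&peer,
                    sizeof(peer)) == 0)                 /* .connect */
                send(fd, "ping", 4, 0);                 /* .sendmsg */
        close(fd);                                      /* .close   */
        return 0;
}
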
    2829             : 
    2830           0 : static void __net_exit tcp_sk_exit(struct net *net)
    2831             : {
    2832           0 :         int cpu;
    2833             : 
    2834           0 :         if (net->ipv4.tcp_congestion_control)
    2835           0 :                 bpf_module_put(net->ipv4.tcp_congestion_control,
    2836             :                                net->ipv4.tcp_congestion_control->owner);
    2837             : 
    2838           0 :         for_each_possible_cpu(cpu)
    2839           0 :                 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
    2840           0 :         free_percpu(net->ipv4.tcp_sk);
    2841           0 : }
    2842             : 
    2843           1 : static int __net_init tcp_sk_init(struct net *net)
    2844             : {
    2845           1 :         int res, cpu, cnt;
    2846             : 
    2847           1 :         net->ipv4.tcp_sk = alloc_percpu(struct sock *);
    2848           1 :         if (!net->ipv4.tcp_sk)
    2849             :                 return -ENOMEM;
    2850             : 
    2851           5 :         for_each_possible_cpu(cpu) {
    2852           4 :                 struct sock *sk;
    2853             : 
    2854           4 :                 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
    2855             :                                            IPPROTO_TCP, net);
    2856           4 :                 if (res)
    2857           0 :                         goto fail;
    2858           4 :                 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
    2859             : 
    2860             :                 /* Enforce IP_DF and IPID == 0 for RSTs and ACKs
    2861             :                  * sent in SYN-RECV and TIME-WAIT states.
    2862             :                  */
    2863           4 :                 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
    2864             : 
    2865           4 :                 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
    2866             :         }
    2867             : 
    2868           1 :         net->ipv4.sysctl_tcp_ecn = 2;
    2869           1 :         net->ipv4.sysctl_tcp_ecn_fallback = 1;
    2870             : 
    2871           1 :         net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
    2872           1 :         net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
    2873           1 :         net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
    2874           1 :         net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
    2875           1 :         net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
    2876             : 
    2877           1 :         net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
    2878           1 :         net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
    2879           1 :         net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
    2880             : 
    2881           1 :         net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
    2882           1 :         net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
    2883           1 :         net->ipv4.sysctl_tcp_syncookies = 1;
    2884           1 :         net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
    2885           1 :         net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
    2886           1 :         net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
    2887           1 :         net->ipv4.sysctl_tcp_orphan_retries = 0;
    2888           1 :         net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
    2889           1 :         net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
    2890           1 :         net->ipv4.sysctl_tcp_tw_reuse = 2;
    2891           1 :         net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
    2892             : 
    2893           1 :         cnt = tcp_hashinfo.ehash_mask + 1;
    2894           1 :         net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
    2895           1 :         net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
    2896             : 
    2897           1 :         net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
    2898           1 :         net->ipv4.sysctl_tcp_sack = 1;
    2899           1 :         net->ipv4.sysctl_tcp_window_scaling = 1;
    2900           1 :         net->ipv4.sysctl_tcp_timestamps = 1;
    2901           1 :         net->ipv4.sysctl_tcp_early_retrans = 3;
    2902           1 :         net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
    2903           1 :         net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
    2904           1 :         net->ipv4.sysctl_tcp_retrans_collapse = 1;
    2905           1 :         net->ipv4.sysctl_tcp_max_reordering = 300;
    2906           1 :         net->ipv4.sysctl_tcp_dsack = 1;
    2907           1 :         net->ipv4.sysctl_tcp_app_win = 31;
    2908           1 :         net->ipv4.sysctl_tcp_adv_win_scale = 1;
    2909           1 :         net->ipv4.sysctl_tcp_frto = 2;
    2910           1 :         net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
    2911             :         /* This limits the percentage of the congestion window which we
    2912             :          * will allow a single TSO frame to consume.  Building TSO frames
    2913             :          * which are too large can cause TCP streams to be bursty.
    2914             :          */
    2915           1 :         net->ipv4.sysctl_tcp_tso_win_divisor = 3;
    2916             :         /* Default TSQ limit of 16 TSO segments */
    2917           1 :         net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
    2918             :         /* rfc5961 challenge ack rate limiting */
    2919           1 :         net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
    2920           1 :         net->ipv4.sysctl_tcp_min_tso_segs = 2;
    2921           1 :         net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
    2922           1 :         net->ipv4.sysctl_tcp_autocorking = 1;
    2923           1 :         net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
    2924           1 :         net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
    2925           1 :         net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
    2926           1 :         if (net != &init_net) {
    2927           0 :                 memcpy(net->ipv4.sysctl_tcp_rmem,
    2928             :                        init_net.ipv4.sysctl_tcp_rmem,
    2929             :                        sizeof(init_net.ipv4.sysctl_tcp_rmem));
    2930           0 :                 memcpy(net->ipv4.sysctl_tcp_wmem,
    2931             :                        init_net.ipv4.sysctl_tcp_wmem,
    2932             :                        sizeof(init_net.ipv4.sysctl_tcp_wmem));
    2933             :         }
    2934           1 :         net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
    2935           1 :         net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
    2936           1 :         net->ipv4.sysctl_tcp_comp_sack_nr = 44;
    2937           1 :         net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
    2938           1 :         spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
    2939           1 :         net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
    2940           1 :         atomic_set(&net->ipv4.tfo_active_disable_times, 0);
    2941             : 
    2942             :         /* Reno is always built in */
    2943           1 :         if (!net_eq(net, &init_net) &&
    2944             :             bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
    2945             :                                init_net.ipv4.tcp_congestion_control->owner))
    2946             :                 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
    2947             :         else
    2948           1 :                 net->ipv4.tcp_congestion_control = &tcp_reno;
    2949             : 
    2950           1 :         return 0;
    2951           0 : fail:
    2952           0 :         tcp_sk_exit(net);
    2953             : 
    2954           0 :         return res;
    2955             : }
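
Editor's note: the per-namespace defaults installed in tcp_sk_init() above are starting
points, not hard limits — most of them can be overridden per socket from userspace.
Below is a minimal illustrative sketch (not part of tcp_ipv4.c; error handling elided)
showing the standard setsockopt() knobs that shadow the tcp_keepalive_* sysctls and the
per-netns default congestion control for a single connection:

#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
        int fd = socket(AF_INET, SOCK_STREAM, 0);
        int on = 1, idle = 60, intvl = 10, cnt = 5;

        /* Enable keepalive and override, for this socket only, the
         * net.ipv4.tcp_keepalive_{time,intvl,probes} defaults that
         * tcp_sk_init() installed for the namespace.
         */
        setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &on, sizeof(on));
        setsockopt(fd, IPPROTO_TCP, TCP_KEEPIDLE, &idle, sizeof(idle));
        setsockopt(fd, IPPROTO_TCP, TCP_KEEPINTVL, &intvl, sizeof(intvl));
        setsockopt(fd, IPPROTO_TCP, TCP_KEEPCNT, &cnt, sizeof(cnt));

        /* Likewise, TCP_CONGESTION overrides the per-netns default
         * congestion control chosen at the end of tcp_sk_init().
         */
        setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, "reno", 4);

        close(fd);
        return 0;
}
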
    2956             : 
    2957           0 : static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
    2958             : {
    2959           0 :         struct net *net;
    2960             : 
    2961           0 :         inet_twsk_purge(&tcp_hashinfo, AF_INET);
    2962             : 
    2963           0 :         list_for_each_entry(net, net_exit_list, exit_list)
    2964           0 :                 tcp_fastopen_ctx_destroy(net);
    2965           0 : }
    2966             : 
    2967             : static struct pernet_operations __net_initdata tcp_sk_ops = {
     2968             :         .init       = tcp_sk_init,
     2969             :         .exit       = tcp_sk_exit,
     2970             :         .exit_batch = tcp_sk_exit_batch,
    2971             : };
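
Editor's note: tcp_sk_ops above is an instance of the generic pernet_operations
pattern — .init runs once for init_net and for every namespace created afterwards,
.exit (or the batched .exit_batch variant) on namespace teardown. A rough sketch of
the same pattern for a hypothetical module (the foo_* names are illustrative, not
from this file):

/* Illustrative only: hypothetical module using the pernet pattern. */
#include <linux/module.h>
#include <net/net_namespace.h>

static int __net_init foo_net_init(struct net *net)
{
        /* Allocate and initialize per-namespace state for 'net' here. */
        return 0;
}

static void __net_exit foo_net_exit(struct net *net)
{
        /* Release whatever foo_net_init() set up for 'net'. */
}

static struct pernet_operations foo_net_ops = {
        .init = foo_net_init,
        .exit = foo_net_exit,
};

static int __init foo_init(void)
{
        /* .init fires for init_net and for each namespace created later. */
        return register_pernet_subsys(&foo_net_ops);
}

static void __exit foo_exit(void)
{
        unregister_pernet_subsys(&foo_net_ops);
}

module_init(foo_init);
module_exit(foo_exit);
MODULE_LICENSE("GPL");
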
    2972             : 
    2973             : #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
    2974             : DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
    2975             :                      struct sock_common *sk_common, uid_t uid)
    2976             : 
    2977             : static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
    2978             : {
    2979             :         struct tcp_iter_state *st = priv_data;
    2980             :         struct tcp_seq_afinfo *afinfo;
    2981             :         int ret;
    2982             : 
    2983             :         afinfo = kmalloc(sizeof(*afinfo), GFP_USER | __GFP_NOWARN);
    2984             :         if (!afinfo)
    2985             :                 return -ENOMEM;
    2986             : 
    2987             :         afinfo->family = AF_UNSPEC;
    2988             :         st->bpf_seq_afinfo = afinfo;
    2989             :         ret = bpf_iter_init_seq_net(priv_data, aux);
    2990             :         if (ret)
    2991             :                 kfree(afinfo);
    2992             :         return ret;
    2993             : }
    2994             : 
    2995             : static void bpf_iter_fini_tcp(void *priv_data)
    2996             : {
    2997             :         struct tcp_iter_state *st = priv_data;
    2998             : 
    2999             :         kfree(st->bpf_seq_afinfo);
    3000             :         bpf_iter_fini_seq_net(priv_data);
    3001             : }
    3002             : 
    3003             : static const struct bpf_iter_seq_info tcp_seq_info = {
    3004             :         .seq_ops                = &bpf_iter_tcp_seq_ops,
    3005             :         .init_seq_private       = bpf_iter_init_tcp,
    3006             :         .fini_seq_private       = bpf_iter_fini_tcp,
    3007             :         .seq_priv_size          = sizeof(struct tcp_iter_state),
    3008             : };
    3009             : 
    3010             : static struct bpf_iter_reg tcp_reg_info = {
    3011             :         .target                 = "tcp",
    3012             :         .ctx_arg_info_size      = 1,
    3013             :         .ctx_arg_info           = {
    3014             :                 { offsetof(struct bpf_iter__tcp, sk_common),
    3015             :                   PTR_TO_BTF_ID_OR_NULL },
    3016             :         },
    3017             :         .seq_info               = &tcp_seq_info,
    3018             : };
    3019             : 
    3020             : static void __init bpf_iter_register(void)
    3021             : {
    3022             :         tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
    3023             :         if (bpf_iter_reg_target(&tcp_reg_info))
    3024             :                 pr_warn("Warning: could not register bpf iterator tcp\n");
    3025             : }
    3026             : 
    3027             : #endif
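
Editor's note: the "tcp" iterator target registered by bpf_iter_register() above is
consumed from userspace through the generic BPF iterator interface: attach a program
to the target, create an iterator fd, and read() the text the program emits as the
kernel walks the TCP hash tables. A minimal libbpf sketch, assuming a separately
compiled SEC("iter/tcp") program named dump_tcp in iter_tcp.bpf.o (both names are
illustrative; error handling abbreviated):

#include <bpf/libbpf.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        struct bpf_object *obj;
        struct bpf_program *prog;
        struct bpf_link *link;
        char buf[4096];
        int iter_fd;
        ssize_t n;

        obj = bpf_object__open_file("iter_tcp.bpf.o", NULL);
        if (!obj || bpf_object__load(obj))
                return 1;

        /* Attach to the "tcp" target registered by bpf_iter_register(). */
        prog = bpf_object__find_program_by_name(obj, "dump_tcp");
        link = bpf_program__attach_iter(prog, NULL);
        if (!link)
                return 1;

        /* Each read() drives the iterator over the TCP sockets. */
        iter_fd = bpf_iter_create(bpf_link__fd(link));
        while ((n = read(iter_fd, buf, sizeof(buf))) > 0)
                fwrite(buf, 1, n, stdout);

        close(iter_fd);
        bpf_link__destroy(link);
        bpf_object__close(obj);
        return 0;
}
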
    3028             : 
    3029           1 : void __init tcp_v4_init(void)
    3030             : {
    3031           1 :         if (register_pernet_subsys(&tcp_sk_ops))
    3032           0 :                 panic("Failed to create the TCP control socket.\n");
    3033             : 
    3034             : #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
    3035             :         bpf_iter_register();
    3036             : #endif
    3037           1 : }

Generated by: LCOV version 1.14