Line data Source code
1 : // SPDX-License-Identifier: GPL-2.0-or-later
2 : /*
3 : * INET An implementation of the TCP/IP protocol suite for the LINUX
4 : * operating system. INET is implemented using the BSD Socket
5 : * interface as the means of communication with the user level.
6 : *
7 : * Implementation of the Transmission Control Protocol(TCP).
8 : *
9 : * IPv4 specific functions
10 : *
11 : * code split from:
12 : * linux/ipv4/tcp.c
13 : * linux/ipv4/tcp_input.c
14 : * linux/ipv4/tcp_output.c
15 : *
16 : * See tcp.c for author information
17 : */
18 :
19 : /*
20 : * Changes:
21 : * David S. Miller : New socket lookup architecture.
22 : * This code is dedicated to John Dyson.
23 : * David S. Miller : Change semantics of established hash,
24 : * half is devoted to TIME_WAIT sockets
25 : * and the rest go in the other half.
26 : * Andi Kleen : Add support for syncookies and fixed
27 : * some bugs: ip options weren't passed to
28 : * the TCP layer, missed a check for an
29 : * ACK bit.
30 : * Andi Kleen : Implemented fast path mtu discovery.
31 : * Fixed many serious bugs in the
32 : * request_sock handling and moved
33 : * most of it into the af independent code.
34 : * Added tail drop and some other bugfixes.
35 : * Added new listen semantics.
36 : * Mike McLagan : Routing by source
37 : * Juan Jose Ciarlante: ip_dynaddr bits
38 : * Andi Kleen: various fixes.
39 : * Vitaly E. Lavrov : Transparent proxy revived after year
40 : * coma.
41 : * Andi Kleen : Fix new listen.
42 : * Andi Kleen : Fix accept error reporting.
43 : * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
44 : * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind
45 : * a single port at the same time.
46 : */
47 :
48 : #define pr_fmt(fmt) "TCP: " fmt
49 :
50 : #include <linux/bottom_half.h>
51 : #include <linux/types.h>
52 : #include <linux/fcntl.h>
53 : #include <linux/module.h>
54 : #include <linux/random.h>
55 : #include <linux/cache.h>
56 : #include <linux/jhash.h>
57 : #include <linux/init.h>
58 : #include <linux/times.h>
59 : #include <linux/slab.h>
60 :
61 : #include <net/net_namespace.h>
62 : #include <net/icmp.h>
63 : #include <net/inet_hashtables.h>
64 : #include <net/tcp.h>
65 : #include <net/transp_v6.h>
66 : #include <net/ipv6.h>
67 : #include <net/inet_common.h>
68 : #include <net/timewait_sock.h>
69 : #include <net/xfrm.h>
70 : #include <net/secure_seq.h>
71 : #include <net/busy_poll.h>
72 :
73 : #include <linux/inet.h>
74 : #include <linux/ipv6.h>
75 : #include <linux/stddef.h>
76 : #include <linux/proc_fs.h>
77 : #include <linux/seq_file.h>
78 : #include <linux/inetdevice.h>
79 : #include <linux/btf_ids.h>
80 :
81 : #include <crypto/hash.h>
82 : #include <linux/scatterlist.h>
83 :
84 : #include <trace/events/tcp.h>
85 :
86 : #ifdef CONFIG_TCP_MD5SIG
87 : static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
88 : __be32 daddr, __be32 saddr, const struct tcphdr *th);
89 : #endif
90 :
91 : struct inet_hashinfo tcp_hashinfo;
92 : EXPORT_SYMBOL(tcp_hashinfo);
93 :
94 4 : static u32 tcp_v4_init_seq(const struct sk_buff *skb)
95 : {
96 4 : return secure_tcp_seq(ip_hdr(skb)->daddr,
97 4 : ip_hdr(skb)->saddr,
98 4 : tcp_hdr(skb)->dest,
99 4 : tcp_hdr(skb)->source);
100 : }
101 :
102 0 : static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
103 : {
104 0 : return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
105 : }
106 :
107 0 : int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
108 : {
109 0 : const struct inet_timewait_sock *tw = inet_twsk(sktw);
110 0 : const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
111 0 : struct tcp_sock *tp = tcp_sk(sk);
112 0 : int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;
113 :
114 0 : if (reuse == 2) {
115 : /* Still does not detect *everything* that goes through
116 : * lo, since we require a loopback src or dst address
117 : * or direct binding to 'lo' interface.
118 : */
119 0 : bool loopback = false;
120 0 : if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
121 0 : loopback = true;
122 : #if IS_ENABLED(CONFIG_IPV6)
123 : if (tw->tw_family == AF_INET6) {
124 : if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
125 : ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
126 : ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
127 : ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
128 : loopback = true;
129 : } else
130 : #endif
131 : {
132 0 : if (ipv4_is_loopback(tw->tw_daddr) ||
133 0 : ipv4_is_loopback(tw->tw_rcv_saddr))
134 : loopback = true;
135 : }
136 0 : if (!loopback)
137 0 : reuse = 0;
138 : }
139 :
140 : /* With PAWS, it is safe from the viewpoint
141 : of data integrity. Even without PAWS it is safe provided sequence
142 : spaces do not overlap i.e. at data rates <= 80Mbit/sec.
143 :
144 : Actually, the idea is close to VJ's one, only timestamp cache is
145 : held not per host, but per port pair and TW bucket is used as state
146 : holder.
147 :
148 : If the TW bucket has already been destroyed we fall back to VJ's scheme
149 : and use initial timestamp retrieved from peer table.
150 : */
151 0 : if (tcptw->tw_ts_recent_stamp &&
152 0 : (!twp || (reuse && time_after32(ktime_get_seconds(),
153 : tcptw->tw_ts_recent_stamp)))) {
154 : /* In case of repair and re-using TIME-WAIT sockets we still
155 : * want to be sure that it is safe as above but honor the
156 : * sequence numbers and time stamps set as part of the repair
157 : * process.
158 : *
159 : * Without this check re-using a TIME-WAIT socket with TCP
160 : * repair would accumulate a -1 on the repair assigned
161 : * sequence number. The first time it is reused the sequence
162 : * is -1, the second time -2, etc. This fixes that issue
163 : * without appearing to create any others.
164 : */
165 0 : if (likely(!tp->repair)) {
166 0 : u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
167 :
168 0 : if (!seq)
169 : seq = 1;
170 0 : WRITE_ONCE(tp->write_seq, seq);
171 0 : tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
172 0 : tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
173 : }
174 0 : sock_hold(sktw);
175 0 : return 1;
176 : }
177 :
178 : return 0;
179 : }
180 : EXPORT_SYMBOL_GPL(tcp_twsk_unique);
181 :
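tcp_twsk_unique() above decides whether a port pair still held by a TIME-WAIT socket may be taken over by a new outgoing connection: the old timestamps must exist and lie strictly in the past (when net.ipv4.tcp_tw_reuse permits reuse at all), and the new initial sequence number is pushed past the old send space. The standalone sketch below only reproduces those two arithmetic checks outside the kernel; the helper names are local stand-ins, not kernel APIs.

    #include <stdint.h>
    #include <stdio.h>

    /* Wrap-safe "a is later than b" test on 32-bit second counters,
     * in the spirit of the kernel's time_after32().
     */
    static int after32(uint32_t a, uint32_t b)
    {
            return (int32_t)(a - b) > 0;
    }

    /* The new ISN is pushed past the old connection's send space so the
     * two sequence spaces cannot overlap; 0 is skipped because a zero
     * write_seq means "pick a fresh random ISN" in tcp_v4_connect().
     */
    static uint32_t bumped_isn(uint32_t tw_snd_nxt)
    {
            uint32_t seq = tw_snd_nxt + 65535 + 2;

            return seq ? seq : 1;
    }

    int main(void)
    {
            uint32_t now = 1000, stamp = 998;       /* hypothetical seconds */

            printf("stamp is old enough: %d\n", after32(now, stamp));
            printf("new isn: %u\n", bumped_isn(0xffff0000u));
            return 0;
    }
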
182 0 : static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
183 : int addr_len)
184 : {
185 : /* This check is replicated from tcp_v4_connect() and intended to
186 : * prevent BPF program called below from accessing bytes that are out
187 : * of the bound specified by user in addr_len.
188 : */
189 0 : if (addr_len < sizeof(struct sockaddr_in))
190 : return -EINVAL;
191 :
192 0 : sock_owned_by_me(sk);
193 :
194 0 : return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
195 : }
196 :
197 : /* This will initiate an outgoing connection. */
198 0 : int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
199 : {
200 0 : struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
201 0 : struct inet_sock *inet = inet_sk(sk);
202 0 : struct tcp_sock *tp = tcp_sk(sk);
203 0 : __be16 orig_sport, orig_dport;
204 0 : __be32 daddr, nexthop;
205 0 : struct flowi4 *fl4;
206 0 : struct rtable *rt;
207 0 : int err;
208 0 : struct ip_options_rcu *inet_opt;
209 0 : struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
210 :
211 0 : if (addr_len < sizeof(struct sockaddr_in))
212 : return -EINVAL;
213 :
214 0 : if (usin->sin_family != AF_INET)
215 : return -EAFNOSUPPORT;
216 :
217 0 : nexthop = daddr = usin->sin_addr.s_addr;
218 0 : inet_opt = rcu_dereference_protected(inet->inet_opt,
219 : lockdep_sock_is_held(sk));
220 0 : if (inet_opt && inet_opt->opt.srr) {
221 0 : if (!daddr)
222 : return -EINVAL;
223 0 : nexthop = inet_opt->opt.faddr;
224 : }
225 :
226 0 : orig_sport = inet->inet_sport;
227 0 : orig_dport = usin->sin_port;
228 0 : fl4 = &inet->cork.fl.u.ip4;
229 0 : rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
230 0 : RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
231 : IPPROTO_TCP,
232 : orig_sport, orig_dport, sk);
233 0 : if (IS_ERR(rt)) {
234 0 : err = PTR_ERR(rt);
235 0 : if (err == -ENETUNREACH)
236 0 : IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
237 0 : return err;
238 : }
239 :
240 0 : if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
241 0 : ip_rt_put(rt);
242 0 : return -ENETUNREACH;
243 : }
244 :
245 0 : if (!inet_opt || !inet_opt->opt.srr)
246 0 : daddr = fl4->daddr;
247 :
248 0 : if (!inet->inet_saddr)
249 0 : inet->inet_saddr = fl4->saddr;
250 0 : sk_rcv_saddr_set(sk, inet->inet_saddr);
251 :
252 0 : if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
253 : /* Reset inherited state */
254 0 : tp->rx_opt.ts_recent = 0;
255 0 : tp->rx_opt.ts_recent_stamp = 0;
256 0 : if (likely(!tp->repair))
257 0 : WRITE_ONCE(tp->write_seq, 0);
258 : }
259 :
260 0 : inet->inet_dport = usin->sin_port;
261 0 : sk_daddr_set(sk, daddr);
262 :
263 0 : inet_csk(sk)->icsk_ext_hdr_len = 0;
264 0 : if (inet_opt)
265 0 : inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
266 :
267 0 : tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
268 :
269 : /* Socket identity is still unknown (sport may be zero).
270 : * However we set the state to SYN-SENT and, without releasing the socket
271 : * lock, select a source port, enter ourselves into the hash tables and
272 : * complete initialization after this.
273 : */
274 0 : tcp_set_state(sk, TCP_SYN_SENT);
275 0 : err = inet_hash_connect(tcp_death_row, sk);
276 0 : if (err)
277 0 : goto failure;
278 :
279 0 : sk_set_txhash(sk);
280 :
281 0 : rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
282 0 : inet->inet_sport, inet->inet_dport, sk);
283 0 : if (IS_ERR(rt)) {
284 0 : err = PTR_ERR(rt);
285 0 : rt = NULL;
286 0 : goto failure;
287 : }
288 : /* OK, now commit destination to socket. */
289 0 : sk->sk_gso_type = SKB_GSO_TCPV4;
290 0 : sk_setup_caps(sk, &rt->dst);
291 0 : rt = NULL;
292 :
293 0 : if (likely(!tp->repair)) {
294 0 : if (!tp->write_seq)
295 0 : WRITE_ONCE(tp->write_seq,
296 : secure_tcp_seq(inet->inet_saddr,
297 : inet->inet_daddr,
298 : inet->inet_sport,
299 : usin->sin_port));
300 0 : tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
301 : inet->inet_saddr,
302 : inet->inet_daddr);
303 : }
304 :
305 0 : inet->inet_id = prandom_u32();
306 :
307 0 : if (tcp_fastopen_defer_connect(sk, &err))
308 0 : return err;
309 0 : if (err)
310 0 : goto failure;
311 :
312 0 : err = tcp_connect(sk);
313 :
314 0 : if (err)
315 0 : goto failure;
316 :
317 : return 0;
318 :
319 0 : failure:
320 : /*
321 : * This unhashes the socket and releases the local port,
322 : * if necessary.
323 : */
324 0 : tcp_set_state(sk, TCP_CLOSE);
325 0 : ip_rt_put(rt);
326 0 : sk->sk_route_caps = 0;
327 0 : inet->inet_dport = 0;
328 0 : return err;
329 : }
330 : EXPORT_SYMBOL(tcp_v4_connect);
331 :
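For orientation, tcp_v4_connect() is the kernel half of connect(2) on an AF_INET stream socket: it resolves the route, picks the source address and port, moves the socket to SYN-SENT and lets tcp_connect() emit the SYN. A minimal user-space counterpart is sketched below; the destination 127.0.0.1:8080 is a placeholder chosen for the example, not anything implied by the code above.

    #include <arpa/inet.h>
    #include <netinet/in.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/socket.h>
    #include <unistd.h>

    int main(void)
    {
            struct sockaddr_in dst;
            int fd = socket(AF_INET, SOCK_STREAM, 0);

            if (fd < 0) {
                    perror("socket");
                    return 1;
            }

            memset(&dst, 0, sizeof(dst));
            dst.sin_family = AF_INET;               /* anything else gets -EAFNOSUPPORT */
            dst.sin_port = htons(8080);             /* placeholder port */
            inet_pton(AF_INET, "127.0.0.1", &dst.sin_addr);

            /* connect(2) with addr_len >= sizeof(struct sockaddr_in); the
             * kernel picks the source address/port and sends the SYN.
             */
            if (connect(fd, (struct sockaddr *)&dst, sizeof(dst)) < 0)
                    perror("connect");
            else
                    puts("three-way handshake completed");

            close(fd);
            return 0;
    }
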
332 : /*
333 : * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
334 : * It can be called through tcp_release_cb() if socket was owned by user
335 : * at the time tcp_v4_err() was called to handle ICMP message.
336 : */
337 0 : void tcp_v4_mtu_reduced(struct sock *sk)
338 : {
339 0 : struct inet_sock *inet = inet_sk(sk);
340 0 : struct dst_entry *dst;
341 0 : u32 mtu;
342 :
343 0 : if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
344 : return;
345 0 : mtu = tcp_sk(sk)->mtu_info;
346 0 : dst = inet_csk_update_pmtu(sk, mtu);
347 0 : if (!dst)
348 : return;
349 :
350 : /* Something is about to be wrong... Remember soft error
351 : * for the case this connection is not able to recover.
352 : */
353 0 : if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
354 0 : sk->sk_err_soft = EMSGSIZE;
355 :
356 0 : mtu = dst_mtu(dst);
357 :
358 0 : if (inet->pmtudisc != IP_PMTUDISC_DONT &&
359 0 : ip_sk_accept_pmtu(sk) &&
360 0 : inet_csk(sk)->icsk_pmtu_cookie > mtu) {
361 0 : tcp_sync_mss(sk, mtu);
362 :
363 : /* Resend the TCP packet because it's
364 : * clear that the old packet has been
365 : * dropped. This is the new "fast" path mtu
366 : * discovery.
367 : */
368 0 : tcp_simple_retransmit(sk);
369 : } /* else let the usual retransmit timer handle it */
370 : }
371 : EXPORT_SYMBOL(tcp_v4_mtu_reduced);
372 :
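tcp_v4_mtu_reduced() re-clamps the MSS after an ICMP_FRAG_NEEDED message has lowered the path MTU and retransmits right away. From user space the path MTU currently cached for a connected socket can be observed through the IP_MTU socket option; a small sketch follows, assuming fd is an already connected AF_INET TCP socket.

    #include <netinet/in.h>
    #include <stdio.h>
    #include <sys/socket.h>

    /* Assumes 'fd' is an already connected AF_INET TCP socket. */
    static void print_path_mtu(int fd)
    {
            socklen_t len = sizeof(int);
            int mtu = 0;

            /* IP_MTU reports the path MTU cached in the route the socket
             * uses; it drops when an ICMP_FRAG_NEEDED is processed above.
             */
            if (getsockopt(fd, IPPROTO_IP, IP_MTU, &mtu, &len) == 0)
                    printf("path MTU: %d\n", mtu);
            else
                    perror("getsockopt(IP_MTU)");
    }
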
373 0 : static void do_redirect(struct sk_buff *skb, struct sock *sk)
374 : {
375 0 : struct dst_entry *dst = __sk_dst_check(sk, 0);
376 :
377 0 : if (dst)
378 0 : dst->ops->redirect(dst, sk, skb);
379 0 : }
380 :
381 :
382 : /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
383 0 : void tcp_req_err(struct sock *sk, u32 seq, bool abort)
384 : {
385 0 : struct request_sock *req = inet_reqsk(sk);
386 0 : struct net *net = sock_net(sk);
387 :
388 : /* ICMPs are not backlogged, hence we cannot get
389 : * an established socket here.
390 : */
391 0 : if (seq != tcp_rsk(req)->snt_isn) {
392 0 : __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
393 0 : } else if (abort) {
394 : /*
395 : * Still in SYN_RECV, just remove it silently.
396 : * There is no good way to pass the error to the newly
397 : * created socket, and POSIX does not want network
398 : * errors returned from accept().
399 : */
400 0 : inet_csk_reqsk_queue_drop(req->rsk_listener, req);
401 0 : tcp_listendrop(req->rsk_listener);
402 : }
403 0 : reqsk_put(req);
404 0 : }
405 : EXPORT_SYMBOL(tcp_req_err);
406 :
407 : /* TCP-LD (RFC 6069) logic */
408 0 : void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
409 : {
410 0 : struct inet_connection_sock *icsk = inet_csk(sk);
411 0 : struct tcp_sock *tp = tcp_sk(sk);
412 0 : struct sk_buff *skb;
413 0 : s32 remaining;
414 0 : u32 delta_us;
415 :
416 0 : if (sock_owned_by_user(sk))
417 : return;
418 :
419 0 : if (seq != tp->snd_una || !icsk->icsk_retransmits ||
420 0 : !icsk->icsk_backoff)
421 : return;
422 :
423 0 : skb = tcp_rtx_queue_head(sk);
424 0 : if (WARN_ON_ONCE(!skb))
425 : return;
426 :
427 0 : icsk->icsk_backoff--;
428 0 : icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
429 0 : icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
430 :
431 0 : tcp_mstamp_refresh(tp);
432 0 : delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
433 0 : remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
434 :
435 0 : if (remaining > 0) {
436 0 : inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
437 : remaining, TCP_RTO_MAX);
438 : } else {
439 : /* RTO revert clocked out retransmission.
440 : * Will retransmit now.
441 : */
442 0 : tcp_retransmit_timer(sk);
443 : }
444 : }
445 : EXPORT_SYMBOL(tcp_ld_RTO_revert);
446 :
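tcp_ld_RTO_revert() applies RFC 6069: an ICMP net/host unreachable matching the oldest unacknowledged sequence proves the retransmission did leave the host, so one level of exponential backoff is undone and the timer is re-armed with whatever remains of the reduced RTO. A toy recalculation with hypothetical millisecond values, not kernel code:

    #include <stdio.h>

    #define TCP_RTO_MAX_MS 120000                   /* mirrors the 120 s kernel cap */

    int main(void)
    {
            unsigned int base_rto = 300;            /* hypothetical un-backed-off RTO, ms */
            unsigned int backoff = 3;               /* three timeouts so far */
            unsigned int elapsed = 900;             /* ms since the last retransmission */
            unsigned int rto;

            backoff--;                              /* revert one backoff level */
            rto = base_rto << backoff;              /* 300 << 2 = 1200 ms */
            if (rto > TCP_RTO_MAX_MS)
                    rto = TCP_RTO_MAX_MS;

            if (rto > elapsed)
                    printf("re-arm retransmit timer in %u ms\n", rto - elapsed);
            else
                    printf("already overdue: retransmit immediately\n");
            return 0;
    }
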
447 : /*
448 : * This routine is called by the ICMP module when it gets some
449 : * sort of error condition. If err < 0 then the socket should
450 : * be closed and the error returned to the user. If err > 0
451 : * it's just the icmp type << 8 | icmp code. After adjustment
452 : * header points to the first 8 bytes of the tcp header. We need
453 : * to find the appropriate port.
454 : *
455 : * The locking strategy used here is very "optimistic". When
456 : * someone else accesses the socket the ICMP is just dropped
457 : * and for some paths there is no check at all.
458 : * A more general error queue to queue errors for later handling
459 : * is probably better.
460 : *
461 : */
462 :
463 0 : int tcp_v4_err(struct sk_buff *skb, u32 info)
464 : {
465 0 : const struct iphdr *iph = (const struct iphdr *)skb->data;
466 0 : struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
467 0 : struct tcp_sock *tp;
468 0 : struct inet_sock *inet;
469 0 : const int type = icmp_hdr(skb)->type;
470 0 : const int code = icmp_hdr(skb)->code;
471 0 : struct sock *sk;
472 0 : struct request_sock *fastopen;
473 0 : u32 seq, snd_una;
474 0 : int err;
475 0 : struct net *net = dev_net(skb->dev);
476 :
477 0 : sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
478 0 : th->dest, iph->saddr, ntohs(th->source),
479 : inet_iif(skb), 0);
480 0 : if (!sk) {
481 0 : __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
482 0 : return -ENOENT;
483 : }
484 0 : if (sk->sk_state == TCP_TIME_WAIT) {
485 0 : inet_twsk_put(inet_twsk(sk));
486 0 : return 0;
487 : }
488 0 : seq = ntohl(th->seq);
489 0 : if (sk->sk_state == TCP_NEW_SYN_RECV) {
490 0 : tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
491 0 : type == ICMP_TIME_EXCEEDED ||
492 0 : (type == ICMP_DEST_UNREACH &&
493 0 : (code == ICMP_NET_UNREACH ||
494 : code == ICMP_HOST_UNREACH)));
495 0 : return 0;
496 : }
497 :
498 0 : bh_lock_sock(sk);
499 : /* If too many ICMPs get dropped on busy
500 : * servers this needs to be solved differently.
501 : * We do take care of the PMTU discovery (RFC 1191) special case:
502 : * we can receive locally generated ICMP messages while socket is held.
503 : */
504 0 : if (sock_owned_by_user(sk)) {
505 0 : if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
506 0 : __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
507 : }
508 0 : if (sk->sk_state == TCP_CLOSE)
509 0 : goto out;
510 :
511 0 : if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
512 0 : __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
513 0 : goto out;
514 : }
515 :
516 0 : tp = tcp_sk(sk);
517 : /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
518 0 : fastopen = rcu_dereference(tp->fastopen_rsk);
519 0 : snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
520 0 : if (sk->sk_state != TCP_LISTEN &&
521 0 : !between(seq, snd_una, tp->snd_nxt)) {
522 0 : __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
523 0 : goto out;
524 : }
525 :
526 0 : switch (type) {
527 : case ICMP_REDIRECT:
528 0 : if (!sock_owned_by_user(sk))
529 0 : do_redirect(skb, sk);
530 0 : goto out;
531 0 : case ICMP_SOURCE_QUENCH:
532 : /* Just silently ignore these. */
533 0 : goto out;
534 : case ICMP_PARAMETERPROB:
535 : err = EPROTO;
536 : break;
537 0 : case ICMP_DEST_UNREACH:
538 0 : if (code > NR_ICMP_UNREACH)
539 0 : goto out;
540 :
541 0 : if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
542 : /* We are not interested in TCP_LISTEN and open_requests
543 : * (SYN-ACKs sent out by Linux are always < 576 bytes, so
544 : * they should go through unfragmented).
545 : */
546 0 : if (sk->sk_state == TCP_LISTEN)
547 0 : goto out;
548 :
549 0 : tp->mtu_info = info;
550 0 : if (!sock_owned_by_user(sk)) {
551 0 : tcp_v4_mtu_reduced(sk);
552 : } else {
553 0 : if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
554 0 : sock_hold(sk);
555 : }
556 0 : goto out;
557 : }
558 :
559 0 : err = icmp_err_convert[code].errno;
560 : /* check if this ICMP message allows revert of backoff.
561 : * (see RFC 6069)
562 : */
563 0 : if (!fastopen &&
564 0 : (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
565 0 : tcp_ld_RTO_revert(sk, seq);
566 : break;
567 0 : case ICMP_TIME_EXCEEDED:
568 0 : err = EHOSTUNREACH;
569 0 : break;
570 0 : default:
571 0 : goto out;
572 : }
573 :
574 0 : switch (sk->sk_state) {
575 0 : case TCP_SYN_SENT:
576 : case TCP_SYN_RECV:
577 : /* Only in fast or simultaneous open. If a fast open socket is
578 : * already accepted it is treated as a connected one below.
579 : */
580 0 : if (fastopen && !fastopen->sk)
581 : break;
582 :
583 0 : ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
584 :
585 0 : if (!sock_owned_by_user(sk)) {
586 0 : sk->sk_err = err;
587 :
588 0 : sk->sk_error_report(sk);
589 :
590 0 : tcp_done(sk);
591 : } else {
592 0 : sk->sk_err_soft = err;
593 : }
594 0 : goto out;
595 : }
596 :
597 : /* If we've already connected we will keep trying
598 : * until we time out, or the user gives up.
599 : *
600 : * RFC 1122 4.2.3.9 allows us to consider as hard errors
601 : * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
602 : * but it is obsoleted by pmtu discovery).
603 : *
604 : * Note that in the modern internet, where routing is unreliable
605 : * and broken firewalls sit in every dark corner sending random
606 : * errors ordered by their masters, even these two messages finally lose
607 : * their original sense (even Linux sends invalid PORT_UNREACHs).
608 : *
609 : * Now we are in compliance with RFCs.
610 : * --ANK (980905)
611 : */
612 :
613 0 : inet = inet_sk(sk);
614 0 : if (!sock_owned_by_user(sk) && inet->recverr) {
615 0 : sk->sk_err = err;
616 0 : sk->sk_error_report(sk);
617 : } else { /* Only an error on timeout */
618 0 : sk->sk_err_soft = err;
619 : }
620 :
621 0 : out:
622 0 : bh_unlock_sock(sk);
623 0 : sock_put(sk);
624 0 : return 0;
625 : }
626 :
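Most of the errors classified above only reach an application that has opted in: with IP_RECVERR enabled (the inet->recverr test near the end of tcp_v4_err()), queued ICMP errors can be read back from the socket error queue. A hedged sketch follows; fd is assumed to be a connected AF_INET TCP socket and the control-message layout is the standard linux/errqueue.h one.

    #include <linux/errqueue.h>
    #include <netinet/in.h>
    #include <stdio.h>
    #include <sys/socket.h>
    #include <sys/uio.h>

    /* Assumes 'fd' is a connected AF_INET TCP socket. */
    static void drain_error_queue(int fd)
    {
            char data[256], cbuf[512];
            struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) };
            struct msghdr msg = {
                    .msg_iov = &iov, .msg_iovlen = 1,
                    .msg_control = cbuf, .msg_controllen = sizeof(cbuf),
            };
            struct cmsghdr *cm;
            int one = 1;

            setsockopt(fd, IPPROTO_IP, IP_RECVERR, &one, sizeof(one));

            if (recvmsg(fd, &msg, MSG_ERRQUEUE | MSG_DONTWAIT) < 0)
                    return;                         /* nothing queued */

            for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm)) {
                    if (cm->cmsg_level == IPPROTO_IP && cm->cmsg_type == IP_RECVERR) {
                            struct sock_extended_err *ee;

                            ee = (struct sock_extended_err *)CMSG_DATA(cm);
                            printf("errno %u, icmp type %u code %u\n",
                                   ee->ee_errno, ee->ee_type, ee->ee_code);
                    }
            }
    }
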
627 430 : void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
628 : {
629 430 : struct tcphdr *th = tcp_hdr(skb);
630 :
631 430 : th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
632 430 : skb->csum_start = skb_transport_header(skb) - skb->head;
633 430 : skb->csum_offset = offsetof(struct tcphdr, check);
634 430 : }
635 :
636 : /* This routine computes an IPv4 TCP checksum. */
637 426 : void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
638 : {
639 426 : const struct inet_sock *inet = inet_sk(sk);
640 :
641 426 : __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
642 426 : }
643 : EXPORT_SYMBOL(tcp_v4_send_check);
644 :
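__tcp_v4_send_check() above only seeds th->check with the pseudo-header sum and records csum_start/csum_offset, leaving the rest of the fold to the NIC or to the software fallback (CHECKSUM_PARTIAL). The sketch below spells out the full one's-complement sum a device has to finish, written for a contiguous buffer; it is illustrative only, not the kernel's implementation.

    #include <arpa/inet.h>                          /* htonl, htons */
    #include <stdint.h>
    #include <string.h>

    /* One's-complement sum of a buffer taken as 16-bit big-endian words. */
    static uint32_t csum_add(uint32_t sum, const uint8_t *p, size_t len)
    {
            while (len > 1) {
                    sum += ((uint32_t)p[0] << 8) | p[1];
                    p += 2;
                    len -= 2;
            }
            if (len)                                /* odd trailing byte */
                    sum += (uint32_t)p[0] << 8;
            return sum;
    }

    static uint16_t csum_fold(uint32_t sum)
    {
            while (sum >> 16)
                    sum = (sum & 0xffff) + (sum >> 16);
            return (uint16_t)~sum;
    }

    /* Checksum over the 12-byte pseudo-header (saddr, daddr, zero, proto 6,
     * TCP length) plus the TCP header and payload with th->check zeroed.
     * Addresses are passed in host order; store the result back into the
     * header in network byte order.
     */
    static uint16_t tcp_v4_full_csum(uint32_t saddr, uint32_t daddr,
                                     const uint8_t *seg, uint16_t seglen)
    {
            uint32_t s = htonl(saddr), d = htonl(daddr);
            uint16_t l = htons(seglen);
            uint8_t ph[12] = { 0 };

            memcpy(&ph[0], &s, 4);
            memcpy(&ph[4], &d, 4);
            ph[9] = 6;                              /* IPPROTO_TCP */
            memcpy(&ph[10], &l, 2);

            return csum_fold(csum_add(csum_add(0, ph, sizeof(ph)), seg, seglen));
    }
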
645 : /*
646 : * This routine will send an RST to the other tcp.
647 : *
648 : * Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
649 : * for the reset?
650 : * Answer: if a packet caused the RST, it is not for a socket
651 : * existing in our system; if it does match a socket,
652 : * it is just a duplicate segment or a bug in the other side's TCP.
653 : * So we build the reply based only on the parameters
654 : * that arrived with the segment.
655 : * Exception: precedence violation. We do not implement it in any case.
656 : */
657 :
658 0 : static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
659 : {
660 0 : const struct tcphdr *th = tcp_hdr(skb);
661 0 : struct {
662 : struct tcphdr th;
663 : #ifdef CONFIG_TCP_MD5SIG
664 : __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
665 : #endif
666 : } rep;
667 0 : struct ip_reply_arg arg;
668 : #ifdef CONFIG_TCP_MD5SIG
669 : struct tcp_md5sig_key *key = NULL;
670 : const __u8 *hash_location = NULL;
671 : unsigned char newhash[16];
672 : int genhash;
673 : struct sock *sk1 = NULL;
674 : #endif
675 0 : u64 transmit_time = 0;
676 0 : struct sock *ctl_sk;
677 0 : struct net *net;
678 :
679 : /* Never send a reset in response to a reset. */
680 0 : if (th->rst)
681 0 : return;
682 :
683 : /* If sk is not NULL, it means we did a successful lookup and the incoming
684 : * route had to be correct. prequeue might have dropped our dst.
685 : */
686 0 : if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
687 : return;
688 :
689 : /* Swap the send and the receive. */
690 0 : memset(&rep, 0, sizeof(rep));
691 0 : rep.th.dest = th->source;
692 0 : rep.th.source = th->dest;
693 0 : rep.th.doff = sizeof(struct tcphdr) / 4;
694 0 : rep.th.rst = 1;
695 :
696 0 : if (th->ack) {
697 0 : rep.th.seq = th->ack_seq;
698 : } else {
699 0 : rep.th.ack = 1;
700 0 : rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
701 : skb->len - (th->doff << 2));
702 : }
703 :
704 0 : memset(&arg, 0, sizeof(arg));
705 0 : arg.iov[0].iov_base = (unsigned char *)&rep;
706 0 : arg.iov[0].iov_len = sizeof(rep.th);
707 :
708 0 : net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
709 : #ifdef CONFIG_TCP_MD5SIG
710 : rcu_read_lock();
711 : hash_location = tcp_parse_md5sig_option(th);
712 : if (sk && sk_fullsock(sk)) {
713 : const union tcp_md5_addr *addr;
714 : int l3index;
715 :
716 : /* sdif set, means packet ingressed via a device
717 : * in an L3 domain and inet_iif is set to it.
718 : */
719 : l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
720 : addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
721 : key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
722 : } else if (hash_location) {
723 : const union tcp_md5_addr *addr;
724 : int sdif = tcp_v4_sdif(skb);
725 : int dif = inet_iif(skb);
726 : int l3index;
727 :
728 : /*
729 : * The active side is lost. Try to find the listening socket through
730 : * the source port, and then find the md5 key through that listening socket.
731 : * We are not loosening security here:
732 : * the incoming packet is checked with the md5 hash of the found key, and
733 : * no RST is generated if the md5 hash doesn't match.
734 : */
735 : sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
736 : ip_hdr(skb)->saddr,
737 : th->source, ip_hdr(skb)->daddr,
738 : ntohs(th->source), dif, sdif);
739 : /* don't send rst if it can't find key */
740 : if (!sk1)
741 : goto out;
742 :
743 : /* sdif set, means packet ingressed via a device
744 : * in an L3 domain and dif is set to it.
745 : */
746 : l3index = sdif ? dif : 0;
747 : addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
748 : key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
749 : if (!key)
750 : goto out;
751 :
752 :
753 : genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
754 : if (genhash || memcmp(hash_location, newhash, 16) != 0)
755 : goto out;
756 :
757 : }
758 :
759 : if (key) {
760 : rep.opt[0] = htonl((TCPOPT_NOP << 24) |
761 : (TCPOPT_NOP << 16) |
762 : (TCPOPT_MD5SIG << 8) |
763 : TCPOLEN_MD5SIG);
764 : /* Update length and the length the header thinks exists */
765 : arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
766 : rep.th.doff = arg.iov[0].iov_len / 4;
767 :
768 : tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
769 : key, ip_hdr(skb)->saddr,
770 : ip_hdr(skb)->daddr, &rep.th);
771 : }
772 : #endif
773 0 : arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
774 0 : ip_hdr(skb)->saddr, /* XXX */
775 0 : arg.iov[0].iov_len, IPPROTO_TCP, 0);
776 0 : arg.csumoffset = offsetof(struct tcphdr, check) / 2;
777 0 : arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
778 :
779 : /* When the socket is gone, all binding information is lost.
780 : * Routing might fail in this case. No choice here: if we choose to force
781 : * the input interface, we will misroute in case of an asymmetric route.
782 : */
783 0 : if (sk) {
784 0 : arg.bound_dev_if = sk->sk_bound_dev_if;
785 0 : if (sk_fullsock(sk))
786 0 : trace_tcp_send_reset(sk, skb);
787 : }
788 :
789 0 : BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
790 : offsetof(struct inet_timewait_sock, tw_bound_dev_if));
791 :
792 0 : arg.tos = ip_hdr(skb)->tos;
793 0 : arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
794 0 : local_bh_disable();
795 0 : ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
796 0 : if (sk) {
797 0 : ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
798 0 : inet_twsk(sk)->tw_mark : sk->sk_mark;
799 0 : ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
800 0 : inet_twsk(sk)->tw_priority : sk->sk_priority;
801 0 : transmit_time = tcp_transmit_time(sk);
802 : }
803 0 : ip_send_unicast_reply(ctl_sk,
804 0 : skb, &TCP_SKB_CB(skb)->header.h4.opt,
805 0 : ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
806 0 : &arg, arg.iov[0].iov_len,
807 : transmit_time);
808 :
809 0 : ctl_sk->sk_mark = 0;
810 0 : __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
811 0 : __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
812 0 : local_bh_enable();
813 :
814 : #ifdef CONFIG_TCP_MD5SIG
815 : out:
816 : rcu_read_unlock();
817 : #endif
818 : }
819 :
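The reset above is built purely from the offending segment, as RFC 793 reset generation prescribes: if the segment carried an ACK, that ACK value becomes the RST's sequence number; otherwise the RST ACKs everything the segment occupied (SEQ plus SYN, FIN and payload length). A compact sketch of just that field selection, using local stand-in structures rather than the kernel's struct tcphdr:

    #include <stdint.h>

    /* Minimal stand-ins for the relevant fields of the offending segment
     * and of the reset we are about to build.
     */
    struct seg {
            uint32_t seq, ack_seq;
            uint16_t payload_len;
            unsigned int syn:1, fin:1, ack:1, rst:1;
    };

    struct rst {
            uint32_t seq, ack_seq;
            unsigned int ack:1;
    };

    /* Mirrors the field selection in tcp_v4_send_reset(); returns 0 when
     * no reset must be sent at all.
     */
    static int build_rst(const struct seg *in, struct rst *out)
    {
            if (in->rst)
                    return 0;                       /* never answer a RST with a RST */

            if (in->ack) {
                    out->seq = in->ack_seq;         /* SEG.ACK becomes our SEQ */
                    out->ack = 0;
                    out->ack_seq = 0;
            } else {
                    out->seq = 0;
                    out->ack = 1;                   /* ACK everything they sent */
                    out->ack_seq = in->seq + in->syn + in->fin + in->payload_len;
            }
            return 1;
    }
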
820 : /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
821 : outside of socket context, is ugly, certainly. What can I do?
822 : */
823 :
824 0 : static void tcp_v4_send_ack(const struct sock *sk,
825 : struct sk_buff *skb, u32 seq, u32 ack,
826 : u32 win, u32 tsval, u32 tsecr, int oif,
827 : struct tcp_md5sig_key *key,
828 : int reply_flags, u8 tos)
829 : {
830 0 : const struct tcphdr *th = tcp_hdr(skb);
831 0 : struct {
832 : struct tcphdr th;
833 : __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
834 : #ifdef CONFIG_TCP_MD5SIG
835 : + (TCPOLEN_MD5SIG_ALIGNED >> 2)
836 : #endif
837 : ];
838 : } rep;
839 0 : struct net *net = sock_net(sk);
840 0 : struct ip_reply_arg arg;
841 0 : struct sock *ctl_sk;
842 0 : u64 transmit_time;
843 :
844 0 : memset(&rep.th, 0, sizeof(struct tcphdr));
845 0 : memset(&arg, 0, sizeof(arg));
846 :
847 0 : arg.iov[0].iov_base = (unsigned char *)&rep;
848 0 : arg.iov[0].iov_len = sizeof(rep.th);
849 0 : if (tsecr) {
850 0 : rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
851 : (TCPOPT_TIMESTAMP << 8) |
852 : TCPOLEN_TIMESTAMP);
853 0 : rep.opt[1] = htonl(tsval);
854 0 : rep.opt[2] = htonl(tsecr);
855 0 : arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
856 : }
857 :
858 : /* Swap the send and the receive. */
859 0 : rep.th.dest = th->source;
860 0 : rep.th.source = th->dest;
861 0 : rep.th.doff = arg.iov[0].iov_len / 4;
862 0 : rep.th.seq = htonl(seq);
863 0 : rep.th.ack_seq = htonl(ack);
864 0 : rep.th.ack = 1;
865 0 : rep.th.window = htons(win);
866 :
867 : #ifdef CONFIG_TCP_MD5SIG
868 : if (key) {
869 : int offset = (tsecr) ? 3 : 0;
870 :
871 : rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
872 : (TCPOPT_NOP << 16) |
873 : (TCPOPT_MD5SIG << 8) |
874 : TCPOLEN_MD5SIG);
875 : arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
876 : rep.th.doff = arg.iov[0].iov_len/4;
877 :
878 : tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
879 : key, ip_hdr(skb)->saddr,
880 : ip_hdr(skb)->daddr, &rep.th);
881 : }
882 : #endif
883 0 : arg.flags = reply_flags;
884 0 : arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
885 0 : ip_hdr(skb)->saddr, /* XXX */
886 : arg.iov[0].iov_len, IPPROTO_TCP, 0);
887 0 : arg.csumoffset = offsetof(struct tcphdr, check) / 2;
888 0 : if (oif)
889 0 : arg.bound_dev_if = oif;
890 0 : arg.tos = tos;
891 0 : arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
892 0 : local_bh_disable();
893 0 : ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
894 0 : ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
895 0 : inet_twsk(sk)->tw_mark : sk->sk_mark;
896 0 : ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
897 0 : inet_twsk(sk)->tw_priority : sk->sk_priority;
898 0 : transmit_time = tcp_transmit_time(sk);
899 0 : ip_send_unicast_reply(ctl_sk,
900 0 : skb, &TCP_SKB_CB(skb)->header.h4.opt,
901 0 : ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
902 0 : &arg, arg.iov[0].iov_len,
903 : transmit_time);
904 :
905 0 : ctl_sk->sk_mark = 0;
906 0 : __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
907 0 : local_bh_enable();
908 0 : }
909 :
910 0 : static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
911 : {
912 0 : struct inet_timewait_sock *tw = inet_twsk(sk);
913 0 : struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
914 :
915 0 : tcp_v4_send_ack(sk, skb,
916 : tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
917 0 : tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
918 0 : tcp_time_stamp_raw() + tcptw->tw_ts_offset,
919 : tcptw->tw_ts_recent,
920 : tw->tw_bound_dev_if,
921 : tcp_twsk_md5_key(tcptw),
922 0 : tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
923 0 : tw->tw_tos
924 : );
925 :
926 0 : inet_twsk_put(tw);
927 0 : }
928 :
929 0 : static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
930 : struct request_sock *req)
931 : {
932 0 : const union tcp_md5_addr *addr;
933 0 : int l3index;
934 :
935 : /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
936 : * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
937 : */
938 0 : u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
939 0 : tcp_sk(sk)->snd_nxt;
940 :
941 : /* RFC 7323 2.3
942 : * The window field (SEG.WND) of every outgoing segment, with the
943 : * exception of <SYN> segments, MUST be right-shifted by
944 : * Rcv.Wind.Shift bits:
945 : */
946 0 : addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
947 0 : l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
948 0 : tcp_v4_send_ack(sk, skb, seq,
949 0 : tcp_rsk(req)->rcv_nxt,
950 0 : req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
951 0 : tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
952 : req->ts_recent,
953 : 0,
954 : tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
955 0 : inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
956 0 : ip_hdr(skb)->tos);
957 0 : }
958 :
959 : /*
960 : * Send a SYN-ACK after having received a SYN.
961 : * This still operates on a request_sock only, not on a big
962 : * socket.
963 : */
964 4 : static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
965 : struct flowi *fl,
966 : struct request_sock *req,
967 : struct tcp_fastopen_cookie *foc,
968 : enum tcp_synack_type synack_type,
969 : struct sk_buff *syn_skb)
970 : {
971 4 : const struct inet_request_sock *ireq = inet_rsk(req);
972 4 : struct flowi4 fl4;
973 4 : int err = -1;
974 4 : struct sk_buff *skb;
975 4 : u8 tos;
976 :
977 : /* First, grab a route. */
978 4 : if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
979 : return -1;
980 :
981 4 : skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
982 :
983 4 : if (skb) {
984 4 : __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
985 :
986 4 : tos = sock_net(sk)->ipv4.sysctl_tcp_reflect_tos ?
987 0 : (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
988 0 : (inet_sk(sk)->tos & INET_ECN_MASK) :
989 4 : inet_sk(sk)->tos;
990 :
991 8 : if (!INET_ECN_is_capable(tos) &&
992 4 : tcp_bpf_ca_needs_ecn((struct sock *)req))
993 0 : tos |= INET_ECN_ECT_0;
994 :
995 4 : rcu_read_lock();
996 12 : err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
997 : ireq->ir_rmt_addr,
998 4 : rcu_dereference(ireq->ireq_opt),
999 : tos);
1000 4 : rcu_read_unlock();
1001 4 : err = net_xmit_eval(err);
1002 : }
1003 :
1004 : return err;
1005 : }
1006 :
1007 : /*
1008 : * IPv4 request_sock destructor.
1009 : */
1010 4 : static void tcp_v4_reqsk_destructor(struct request_sock *req)
1011 : {
1012 4 : kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1013 4 : }
1014 :
1015 : #ifdef CONFIG_TCP_MD5SIG
1016 : /*
1017 : * RFC2385 MD5 checksumming requires a mapping of
1018 : * IP address->MD5 Key.
1019 : * We need to maintain these in the sk structure.
1020 : */
1021 :
1022 : DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
1023 : EXPORT_SYMBOL(tcp_md5_needed);
1024 :
1025 : /* Find the Key structure for an address. */
1026 : struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1027 : const union tcp_md5_addr *addr,
1028 : int family)
1029 : {
1030 : const struct tcp_sock *tp = tcp_sk(sk);
1031 : struct tcp_md5sig_key *key;
1032 : const struct tcp_md5sig_info *md5sig;
1033 : __be32 mask;
1034 : struct tcp_md5sig_key *best_match = NULL;
1035 : bool match;
1036 :
1037 : /* caller either holds rcu_read_lock() or socket lock */
1038 : md5sig = rcu_dereference_check(tp->md5sig_info,
1039 : lockdep_sock_is_held(sk));
1040 : if (!md5sig)
1041 : return NULL;
1042 :
1043 : hlist_for_each_entry_rcu(key, &md5sig->head, node,
1044 : lockdep_sock_is_held(sk)) {
1045 : if (key->family != family)
1046 : continue;
1047 : if (key->l3index && key->l3index != l3index)
1048 : continue;
1049 : if (family == AF_INET) {
1050 : mask = inet_make_mask(key->prefixlen);
1051 : match = (key->addr.a4.s_addr & mask) ==
1052 : (addr->a4.s_addr & mask);
1053 : #if IS_ENABLED(CONFIG_IPV6)
1054 : } else if (family == AF_INET6) {
1055 : match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1056 : key->prefixlen);
1057 : #endif
1058 : } else {
1059 : match = false;
1060 : }
1061 :
1062 : if (match && (!best_match ||
1063 : key->prefixlen > best_match->prefixlen))
1064 : best_match = key;
1065 : }
1066 : return best_match;
1067 : }
1068 : EXPORT_SYMBOL(__tcp_md5_do_lookup);
1069 :
1070 : static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1071 : const union tcp_md5_addr *addr,
1072 : int family, u8 prefixlen,
1073 : int l3index)
1074 : {
1075 : const struct tcp_sock *tp = tcp_sk(sk);
1076 : struct tcp_md5sig_key *key;
1077 : unsigned int size = sizeof(struct in_addr);
1078 : const struct tcp_md5sig_info *md5sig;
1079 :
1080 : /* caller either holds rcu_read_lock() or socket lock */
1081 : md5sig = rcu_dereference_check(tp->md5sig_info,
1082 : lockdep_sock_is_held(sk));
1083 : if (!md5sig)
1084 : return NULL;
1085 : #if IS_ENABLED(CONFIG_IPV6)
1086 : if (family == AF_INET6)
1087 : size = sizeof(struct in6_addr);
1088 : #endif
1089 : hlist_for_each_entry_rcu(key, &md5sig->head, node,
1090 : lockdep_sock_is_held(sk)) {
1091 : if (key->family != family)
1092 : continue;
1093 : if (key->l3index && key->l3index != l3index)
1094 : continue;
1095 : if (!memcmp(&key->addr, addr, size) &&
1096 : key->prefixlen == prefixlen)
1097 : return key;
1098 : }
1099 : return NULL;
1100 : }
1101 :
1102 : struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1103 : const struct sock *addr_sk)
1104 : {
1105 : const union tcp_md5_addr *addr;
1106 : int l3index;
1107 :
1108 : l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1109 : addr_sk->sk_bound_dev_if);
1110 : addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1111 : return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1112 : }
1113 : EXPORT_SYMBOL(tcp_v4_md5_lookup);
1114 :
1115 : /* This can be called on a newly created socket, from other files */
1116 : int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1117 : int family, u8 prefixlen, int l3index,
1118 : const u8 *newkey, u8 newkeylen, gfp_t gfp)
1119 : {
1120 : /* Add Key to the list */
1121 : struct tcp_md5sig_key *key;
1122 : struct tcp_sock *tp = tcp_sk(sk);
1123 : struct tcp_md5sig_info *md5sig;
1124 :
1125 : key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
1126 : if (key) {
1127 : /* Pre-existing entry - just update that one.
1128 : * Note that the key might be used concurrently.
1129 : * data_race() is telling KCSAN that we do not care about
1130 : * key mismatches, since changing MD5 key on live flows
1131 : * can lead to packet drops.
1132 : */
1133 : data_race(memcpy(key->key, newkey, newkeylen));
1134 :
1135 : /* Pairs with READ_ONCE() in tcp_md5_hash_key().
1136 : * Also note that a reader could catch new key->keylen value
1137 : * but old key->key[], this is the reason we use __GFP_ZERO
1138 : * at sock_kmalloc() time below these lines.
1139 : */
1140 : WRITE_ONCE(key->keylen, newkeylen);
1141 :
1142 : return 0;
1143 : }
1144 :
1145 : md5sig = rcu_dereference_protected(tp->md5sig_info,
1146 : lockdep_sock_is_held(sk));
1147 : if (!md5sig) {
1148 : md5sig = kmalloc(sizeof(*md5sig), gfp);
1149 : if (!md5sig)
1150 : return -ENOMEM;
1151 :
1152 : sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1153 : INIT_HLIST_HEAD(&md5sig->head);
1154 : rcu_assign_pointer(tp->md5sig_info, md5sig);
1155 : }
1156 :
1157 : key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1158 : if (!key)
1159 : return -ENOMEM;
1160 : if (!tcp_alloc_md5sig_pool()) {
1161 : sock_kfree_s(sk, key, sizeof(*key));
1162 : return -ENOMEM;
1163 : }
1164 :
1165 : memcpy(key->key, newkey, newkeylen);
1166 : key->keylen = newkeylen;
1167 : key->family = family;
1168 : key->prefixlen = prefixlen;
1169 : key->l3index = l3index;
1170 : memcpy(&key->addr, addr,
1171 : (family == AF_INET6) ? sizeof(struct in6_addr) :
1172 : sizeof(struct in_addr));
1173 : hlist_add_head_rcu(&key->node, &md5sig->head);
1174 : return 0;
1175 : }
1176 : EXPORT_SYMBOL(tcp_md5_do_add);
1177 :
1178 : int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1179 : u8 prefixlen, int l3index)
1180 : {
1181 : struct tcp_md5sig_key *key;
1182 :
1183 : key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
1184 : if (!key)
1185 : return -ENOENT;
1186 : hlist_del_rcu(&key->node);
1187 : atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1188 : kfree_rcu(key, rcu);
1189 : return 0;
1190 : }
1191 : EXPORT_SYMBOL(tcp_md5_do_del);
1192 :
1193 : static void tcp_clear_md5_list(struct sock *sk)
1194 : {
1195 : struct tcp_sock *tp = tcp_sk(sk);
1196 : struct tcp_md5sig_key *key;
1197 : struct hlist_node *n;
1198 : struct tcp_md5sig_info *md5sig;
1199 :
1200 : md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1201 :
1202 : hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1203 : hlist_del_rcu(&key->node);
1204 : atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1205 : kfree_rcu(key, rcu);
1206 : }
1207 : }
1208 :
1209 : static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1210 : sockptr_t optval, int optlen)
1211 : {
1212 : struct tcp_md5sig cmd;
1213 : struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1214 : const union tcp_md5_addr *addr;
1215 : u8 prefixlen = 32;
1216 : int l3index = 0;
1217 :
1218 : if (optlen < sizeof(cmd))
1219 : return -EINVAL;
1220 :
1221 : if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1222 : return -EFAULT;
1223 :
1224 : if (sin->sin_family != AF_INET)
1225 : return -EINVAL;
1226 :
1227 : if (optname == TCP_MD5SIG_EXT &&
1228 : cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1229 : prefixlen = cmd.tcpm_prefixlen;
1230 : if (prefixlen > 32)
1231 : return -EINVAL;
1232 : }
1233 :
1234 : if (optname == TCP_MD5SIG_EXT &&
1235 : cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1236 : struct net_device *dev;
1237 :
1238 : rcu_read_lock();
1239 : dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1240 : if (dev && netif_is_l3_master(dev))
1241 : l3index = dev->ifindex;
1242 :
1243 : rcu_read_unlock();
1244 :
1245 : /* ok to reference set/not set outside of rcu;
1246 : * right now device MUST be an L3 master
1247 : */
1248 : if (!dev || !l3index)
1249 : return -EINVAL;
1250 : }
1251 :
1252 : addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1253 :
1254 : if (!cmd.tcpm_keylen)
1255 : return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index);
1256 :
1257 : if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1258 : return -EINVAL;
1259 :
1260 : return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index,
1261 : cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
1262 : }
1263 :
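tcp_v4_parse_md5_keys() above services the TCP_MD5SIG / TCP_MD5SIG_EXT socket options (RFC 2385). A hedged user-space sketch of installing a key for one peer before connect() or listen() follows; it needs a kernel built with CONFIG_TCP_MD5SIG, and both the peer 192.0.2.1 and the key text are placeholders.

    #include <arpa/inet.h>
    #include <linux/tcp.h>                          /* struct tcp_md5sig, TCP_MD5SIG */
    #include <netinet/in.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/socket.h>

    int main(void)
    {
            struct tcp_md5sig md5;
            struct sockaddr_in *sin;
            const char *key = "example-secret";     /* placeholder key */
            int fd = socket(AF_INET, SOCK_STREAM, 0);

            if (fd < 0) {
                    perror("socket");
                    return 1;
            }

            memset(&md5, 0, sizeof(md5));
            sin = (struct sockaddr_in *)&md5.tcpm_addr;
            sin->sin_family = AF_INET;
            inet_pton(AF_INET, "192.0.2.1", &sin->sin_addr); /* placeholder peer */

            md5.tcpm_keylen = strlen(key);
            memcpy(md5.tcpm_key, key, md5.tcpm_keylen);

            /* A zero tcpm_keylen would delete the key instead (see above). */
            if (setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5)) < 0)
                    perror("setsockopt(TCP_MD5SIG)");
            else
                    puts("MD5 key installed for 192.0.2.1");
            return 0;
    }
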
1264 : static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1265 : __be32 daddr, __be32 saddr,
1266 : const struct tcphdr *th, int nbytes)
1267 : {
1268 : struct tcp4_pseudohdr *bp;
1269 : struct scatterlist sg;
1270 : struct tcphdr *_th;
1271 :
1272 : bp = hp->scratch;
1273 : bp->saddr = saddr;
1274 : bp->daddr = daddr;
1275 : bp->pad = 0;
1276 : bp->protocol = IPPROTO_TCP;
1277 : bp->len = cpu_to_be16(nbytes);
1278 :
1279 : _th = (struct tcphdr *)(bp + 1);
1280 : memcpy(_th, th, sizeof(*th));
1281 : _th->check = 0;
1282 :
1283 : sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1284 : ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1285 : sizeof(*bp) + sizeof(*th));
1286 : return crypto_ahash_update(hp->md5_req);
1287 : }
1288 :
1289 : static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1290 : __be32 daddr, __be32 saddr, const struct tcphdr *th)
1291 : {
1292 : struct tcp_md5sig_pool *hp;
1293 : struct ahash_request *req;
1294 :
1295 : hp = tcp_get_md5sig_pool();
1296 : if (!hp)
1297 : goto clear_hash_noput;
1298 : req = hp->md5_req;
1299 :
1300 : if (crypto_ahash_init(req))
1301 : goto clear_hash;
1302 : if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1303 : goto clear_hash;
1304 : if (tcp_md5_hash_key(hp, key))
1305 : goto clear_hash;
1306 : ahash_request_set_crypt(req, NULL, md5_hash, 0);
1307 : if (crypto_ahash_final(req))
1308 : goto clear_hash;
1309 :
1310 : tcp_put_md5sig_pool();
1311 : return 0;
1312 :
1313 : clear_hash:
1314 : tcp_put_md5sig_pool();
1315 : clear_hash_noput:
1316 : memset(md5_hash, 0, 16);
1317 : return 1;
1318 : }
1319 :
1320 : int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1321 : const struct sock *sk,
1322 : const struct sk_buff *skb)
1323 : {
1324 : struct tcp_md5sig_pool *hp;
1325 : struct ahash_request *req;
1326 : const struct tcphdr *th = tcp_hdr(skb);
1327 : __be32 saddr, daddr;
1328 :
1329 : if (sk) { /* valid for establish/request sockets */
1330 : saddr = sk->sk_rcv_saddr;
1331 : daddr = sk->sk_daddr;
1332 : } else {
1333 : const struct iphdr *iph = ip_hdr(skb);
1334 : saddr = iph->saddr;
1335 : daddr = iph->daddr;
1336 : }
1337 :
1338 : hp = tcp_get_md5sig_pool();
1339 : if (!hp)
1340 : goto clear_hash_noput;
1341 : req = hp->md5_req;
1342 :
1343 : if (crypto_ahash_init(req))
1344 : goto clear_hash;
1345 :
1346 : if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1347 : goto clear_hash;
1348 : if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1349 : goto clear_hash;
1350 : if (tcp_md5_hash_key(hp, key))
1351 : goto clear_hash;
1352 : ahash_request_set_crypt(req, NULL, md5_hash, 0);
1353 : if (crypto_ahash_final(req))
1354 : goto clear_hash;
1355 :
1356 : tcp_put_md5sig_pool();
1357 : return 0;
1358 :
1359 : clear_hash:
1360 : tcp_put_md5sig_pool();
1361 : clear_hash_noput:
1362 : memset(md5_hash, 0, 16);
1363 : return 1;
1364 : }
1365 : EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1366 :
1367 : #endif
1368 :
1369 : /* Called with rcu_read_lock() */
1370 438 : static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1371 : const struct sk_buff *skb,
1372 : int dif, int sdif)
1373 : {
1374 : #ifdef CONFIG_TCP_MD5SIG
1375 : /*
1376 : * This gets called for each TCP segment that arrives
1377 : * so we want to be efficient.
1378 : * We have 3 drop cases:
1379 : * o No MD5 hash and one expected.
1380 : * o MD5 hash and we're not expecting one.
1381 : * o MD5 hash and it's wrong.
1382 : */
1383 : const __u8 *hash_location = NULL;
1384 : struct tcp_md5sig_key *hash_expected;
1385 : const struct iphdr *iph = ip_hdr(skb);
1386 : const struct tcphdr *th = tcp_hdr(skb);
1387 : const union tcp_md5_addr *addr;
1388 : unsigned char newhash[16];
1389 : int genhash, l3index;
1390 :
1391 : /* sdif set, means packet ingressed via a device
1392 : * in an L3 domain and dif is set to the l3mdev
1393 : */
1394 : l3index = sdif ? dif : 0;
1395 :
1396 : addr = (union tcp_md5_addr *)&iph->saddr;
1397 : hash_expected = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1398 : hash_location = tcp_parse_md5sig_option(th);
1399 :
1400 : /* We've parsed the options - do we have a hash? */
1401 : if (!hash_expected && !hash_location)
1402 : return false;
1403 :
1404 : if (hash_expected && !hash_location) {
1405 : NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1406 : return true;
1407 : }
1408 :
1409 : if (!hash_expected && hash_location) {
1410 : NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1411 : return true;
1412 : }
1413 :
1414 : /* Okay, so this is hash_expected and hash_location -
1415 : * so we need to calculate the checksum.
1416 : */
1417 : genhash = tcp_v4_md5_hash_skb(newhash,
1418 : hash_expected,
1419 : NULL, skb);
1420 :
1421 : if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1422 : NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1423 : net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s L3 index %d\n",
1424 : &iph->saddr, ntohs(th->source),
1425 : &iph->daddr, ntohs(th->dest),
1426 : genhash ? " tcp_v4_calc_md5_hash failed"
1427 : : "", l3index);
1428 : return true;
1429 : }
1430 : return false;
1431 : #endif
1432 438 : return false;
1433 : }
1434 :
1435 4 : static void tcp_v4_init_req(struct request_sock *req,
1436 : const struct sock *sk_listener,
1437 : struct sk_buff *skb)
1438 : {
1439 4 : struct inet_request_sock *ireq = inet_rsk(req);
1440 4 : struct net *net = sock_net(sk_listener);
1441 :
1442 4 : sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1443 4 : sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1444 4 : RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1445 4 : }
1446 :
1447 4 : static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1448 : struct sk_buff *skb,
1449 : struct flowi *fl,
1450 : struct request_sock *req)
1451 : {
1452 4 : tcp_v4_init_req(req, sk, skb);
1453 :
1454 4 : if (security_inet_conn_request(sk, skb, req))
1455 : return NULL;
1456 :
1457 4 : return inet_csk_route_req(sk, &fl->u.ip4, req);
1458 : }
1459 :
1460 : struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1461 : .family = PF_INET,
1462 : .obj_size = sizeof(struct tcp_request_sock),
1463 : .rtx_syn_ack = tcp_rtx_synack,
1464 : .send_ack = tcp_v4_reqsk_send_ack,
1465 : .destructor = tcp_v4_reqsk_destructor,
1466 : .send_reset = tcp_v4_send_reset,
1467 : .syn_ack_timeout = tcp_syn_ack_timeout,
1468 : };
1469 :
1470 : const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1471 : .mss_clamp = TCP_MSS_DEFAULT,
1472 : #ifdef CONFIG_TCP_MD5SIG
1473 : .req_md5_lookup = tcp_v4_md5_lookup,
1474 : .calc_md5_hash = tcp_v4_md5_hash_skb,
1475 : #endif
1476 : #ifdef CONFIG_SYN_COOKIES
1477 : .cookie_init_seq = cookie_v4_init_sequence,
1478 : #endif
1479 : .route_req = tcp_v4_route_req,
1480 : .init_seq = tcp_v4_init_seq,
1481 : .init_ts_off = tcp_v4_init_ts_off,
1482 : .send_synack = tcp_v4_send_synack,
1483 : };
1484 :
1485 4 : int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1486 : {
1487 : /* Never answer SYNs sent to broadcast or multicast addresses */
1488 4 : if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1489 0 : goto drop;
1490 :
1491 4 : return tcp_conn_request(&tcp_request_sock_ops,
1492 : &tcp_request_sock_ipv4_ops, sk, skb);
1493 :
1494 0 : drop:
1495 0 : tcp_listendrop(sk);
1496 0 : return 0;
1497 : }
1498 : EXPORT_SYMBOL(tcp_v4_conn_request);
1499 :
1500 :
1501 : /*
1502 : * The three way handshake has completed - we got a valid synack -
1503 : * now create the new socket.
1504 : */
1505 4 : struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1506 : struct request_sock *req,
1507 : struct dst_entry *dst,
1508 : struct request_sock *req_unhash,
1509 : bool *own_req)
1510 : {
1511 4 : struct inet_request_sock *ireq;
1512 4 : bool found_dup_sk = false;
1513 4 : struct inet_sock *newinet;
1514 4 : struct tcp_sock *newtp;
1515 4 : struct sock *newsk;
1516 : #ifdef CONFIG_TCP_MD5SIG
1517 : const union tcp_md5_addr *addr;
1518 : struct tcp_md5sig_key *key;
1519 : int l3index;
1520 : #endif
1521 4 : struct ip_options_rcu *inet_opt;
1522 :
1523 4 : if (sk_acceptq_is_full(sk))
1524 0 : goto exit_overflow;
1525 :
1526 4 : newsk = tcp_create_openreq_child(sk, req, skb);
1527 4 : if (!newsk)
1528 0 : goto exit_nonewsk;
1529 :
1530 4 : newsk->sk_gso_type = SKB_GSO_TCPV4;
1531 4 : inet_sk_rx_dst_set(newsk, skb);
1532 :
1533 4 : newtp = tcp_sk(newsk);
1534 4 : newinet = inet_sk(newsk);
1535 4 : ireq = inet_rsk(req);
1536 4 : sk_daddr_set(newsk, ireq->ir_rmt_addr);
1537 4 : sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1538 4 : newsk->sk_bound_dev_if = ireq->ir_iif;
1539 4 : newinet->inet_saddr = ireq->ir_loc_addr;
1540 4 : inet_opt = rcu_dereference(ireq->ireq_opt);
1541 4 : RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1542 4 : newinet->mc_index = inet_iif(skb);
1543 4 : newinet->mc_ttl = ip_hdr(skb)->ttl;
1544 4 : newinet->rcv_tos = ip_hdr(skb)->tos;
1545 4 : inet_csk(newsk)->icsk_ext_hdr_len = 0;
1546 4 : if (inet_opt)
1547 0 : inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1548 4 : newinet->inet_id = prandom_u32();
1549 :
1550 : /* Set ToS of the new socket based upon the value of incoming SYN.
1551 : * ECT bits are set later in tcp_init_transfer().
1552 : */
1553 4 : if (sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)
1554 0 : newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1555 :
1556 4 : if (!dst) {
1557 4 : dst = inet_csk_route_child_sock(sk, newsk, req);
1558 4 : if (!dst)
1559 0 : goto put_and_exit;
1560 : } else {
1561 : /* syncookie case : see end of cookie_v4_check() */
1562 4 : }
1563 4 : sk_setup_caps(newsk, dst);
1564 :
1565 4 : tcp_ca_openreq_child(newsk, dst);
1566 :
1567 4 : tcp_sync_mss(newsk, dst_mtu(dst));
1568 4 : newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1569 :
1570 4 : tcp_initialize_rcv_mss(newsk);
1571 :
1572 : #ifdef CONFIG_TCP_MD5SIG
1573 : l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1574 : /* Copy over the MD5 key from the original socket */
1575 : addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1576 : key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1577 : if (key) {
1578 : /*
1579 : * We're using one, so create a matching key
1580 : * on the newsk structure. If we fail to get
1581 : * memory, then we end up not copying the key
1582 : * across. Shucks.
1583 : */
1584 : tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index,
1585 : key->key, key->keylen, GFP_ATOMIC);
1586 : sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1587 : }
1588 : #endif
1589 :
1590 4 : if (__inet_inherit_port(sk, newsk) < 0)
1591 0 : goto put_and_exit;
1592 4 : *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1593 : &found_dup_sk);
1594 4 : if (likely(*own_req)) {
1595 4 : tcp_move_syn(newtp, req);
1596 4 : ireq->ireq_opt = NULL;
1597 : } else {
1598 0 : newinet->inet_opt = NULL;
1599 :
1600 0 : if (!req_unhash && found_dup_sk) {
1601 : /* This code path should only be executed in the
1602 : * syncookie case
1603 : */
1604 0 : bh_unlock_sock(newsk);
1605 0 : sock_put(newsk);
1606 0 : newsk = NULL;
1607 : }
1608 : }
1609 : return newsk;
1610 :
1611 0 : exit_overflow:
1612 0 : NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1613 0 : exit_nonewsk:
1614 0 : dst_release(dst);
1615 0 : exit:
1616 0 : tcp_listendrop(sk);
1617 0 : return NULL;
1618 0 : put_and_exit:
1619 0 : newinet->inet_opt = NULL;
1620 0 : inet_csk_prepare_forced_close(newsk);
1621 0 : tcp_done(newsk);
1622 0 : goto exit;
1623 : }
1624 : EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1625 :
1626 4 : static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1627 : {
1628 : #ifdef CONFIG_SYN_COOKIES
1629 : const struct tcphdr *th = tcp_hdr(skb);
1630 :
1631 : if (!th->syn)
1632 : sk = cookie_v4_check(sk, skb);
1633 : #endif
1634 4 : return sk;
1635 : }
1636 :
1637 0 : u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1638 : struct tcphdr *th, u32 *cookie)
1639 : {
1640 0 : u16 mss = 0;
1641 : #ifdef CONFIG_SYN_COOKIES
1642 : mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1643 : &tcp_request_sock_ipv4_ops, sk, th);
1644 : if (mss) {
1645 : *cookie = __cookie_v4_init_sequence(iph, th, &mss);
1646 : tcp_synq_overflow(sk);
1647 : }
1648 : #endif
1649 0 : return mss;
1650 : }
1651 :
1652 : INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
1653 : u32));
1654 : /* The socket must have its spinlock held when we get
1655 : * here, unless it is a TCP_LISTEN socket.
1656 : *
1657 : * We have a potential double-lock case here, so even when
1658 : * doing backlog processing we use the BH locking scheme.
1659 : * This is because we cannot sleep with the original spinlock
1660 : * held.
1661 : */
1662 419 : int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1663 : {
1664 419 : struct sock *rsk;
1665 :
1666 419 : if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1667 412 : struct dst_entry *dst = sk->sk_rx_dst;
1668 :
1669 412 : sock_rps_save_rxhash(sk, skb);
1670 412 : sk_mark_napi_id(sk, skb);
1671 412 : if (dst) {
1672 824 : if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1673 412 : !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
1674 : dst, 0)) {
1675 0 : dst_release(dst);
1676 0 : sk->sk_rx_dst = NULL;
1677 : }
1678 : }
1679 412 : tcp_rcv_established(sk, skb);
1680 412 : return 0;
1681 : }
1682 :
1683 7 : if (tcp_checksum_complete(skb))
1684 0 : goto csum_err;
1685 :
1686 7 : if (sk->sk_state == TCP_LISTEN) {
1687 4 : struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1688 :
1689 4 : if (!nsk)
1690 0 : goto discard;
1691 : if (nsk != sk) {
1692 : if (tcp_child_process(sk, nsk, skb)) {
1693 : rsk = nsk;
1694 : goto reset;
1695 : }
1696 : return 0;
1697 : }
1698 : } else
1699 3 : sock_rps_save_rxhash(sk, skb);
1700 :
1701 7 : if (tcp_rcv_state_process(sk, skb)) {
1702 0 : rsk = sk;
1703 0 : goto reset;
1704 : }
1705 : return 0;
1706 :
1707 0 : reset:
1708 0 : tcp_v4_send_reset(rsk, skb);
1709 0 : discard:
1710 0 : kfree_skb(skb);
1711 : /* Be careful here. If this function gets more complicated and
1712 : * gcc suffers from register pressure on the x86, sk (in %ebx)
1713 : * might be destroyed here. This current version compiles correctly,
1714 : * but you have been warned.
1715 : */
1716 0 : return 0;
1717 :
1718 0 : csum_err:
1719 0 : TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1720 0 : TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1721 0 : goto discard;
1722 : }
1723 : EXPORT_SYMBOL(tcp_v4_do_rcv);
1724 :
1725 389 : int tcp_v4_early_demux(struct sk_buff *skb)
1726 : {
1727 389 : const struct iphdr *iph;
1728 389 : const struct tcphdr *th;
1729 389 : struct sock *sk;
1730 :
1731 389 : if (skb->pkt_type != PACKET_HOST)
1732 : return 0;
1733 :
1734 389 : if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1735 : return 0;
1736 :
1737 389 : iph = ip_hdr(skb);
1738 389 : th = tcp_hdr(skb);
1739 :
1740 389 : if (th->doff < sizeof(struct tcphdr) / 4)
1741 : return 0;
1742 :
1743 389 : sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1744 389 : iph->saddr, th->source,
1745 389 : iph->daddr, ntohs(th->dest),
1746 : skb->skb_iif, inet_sdif(skb));
1747 389 : if (sk) {
1748 385 : skb->sk = sk;
1749 385 : skb->destructor = sock_edemux;
1750 385 : if (sk_fullsock(sk)) {
1751 381 : struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1752 :
1753 381 : if (dst)
1754 381 : dst = dst_check(dst, 0);
1755 381 : if (dst &&
1756 381 : inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1757 381 : skb_dst_set_noref(skb, dst);
1758 : }
1759 : }
1760 : return 0;
1761 : }
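/* Early demux runs from the IP receive path before routing: on a hit in the
 * established hash it attaches the socket to the skb (released through
 * sock_edemux) and, if the socket's cached rx_dst still matches the incoming
 * interface, reuses that dst so the per-packet route lookup can be skipped.
 */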
1762 :
1763 157 : bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1764 : {
1765 157 : u32 limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf);
1766 157 : u32 tail_gso_size, tail_gso_segs;
1767 157 : struct skb_shared_info *shinfo;
1768 157 : const struct tcphdr *th;
1769 157 : struct tcphdr *thtail;
1770 157 : struct sk_buff *tail;
1771 157 : unsigned int hdrlen;
1772 157 : bool fragstolen;
1773 157 : u32 gso_segs;
1774 157 : u32 gso_size;
1775 157 : int delta;
1776 :
1777 : /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1778 : * we can fix skb->truesize to its real value to avoid future drops.
1779 : * This is valid because skb is not yet charged to the socket.
1780 : * It has been observed that pure SACK packets were sometimes dropped
1781 : * (when built by drivers without the copybreak feature).
1782 : */
1783 157 : skb_condense(skb);
1784 :
1785 157 : skb_dst_drop(skb);
1786 :
1787 157 : if (unlikely(tcp_checksum_complete(skb))) {
1788 0 : bh_unlock_sock(sk);
1789 0 : __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1790 0 : __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1791 0 : return true;
1792 : }
1793 :
1794 : /* Attempt to coalesce into the last skb in the backlog, even if we are
1795 : * above the limits.
1796 : * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1797 : */
1798 157 : th = (const struct tcphdr *)skb->data;
1799 157 : hdrlen = th->doff * 4;
1800 :
1801 157 : tail = sk->sk_backlog.tail;
1802 157 : if (!tail)
1803 141 : goto no_coalesce;
1804 16 : thtail = (struct tcphdr *)tail->data;
1805 :
1806 16 : if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1807 16 : TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1808 16 : ((TCP_SKB_CB(tail)->tcp_flags |
1809 16 : TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1810 : !((TCP_SKB_CB(tail)->tcp_flags &
1811 16 : TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1812 : ((TCP_SKB_CB(tail)->tcp_flags ^
1813 16 : TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1814 : #ifdef CONFIG_TLS_DEVICE
1815 : tail->decrypted != skb->decrypted ||
1816 : #endif
1817 16 : thtail->doff != th->doff ||
1818 16 : memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1819 0 : goto no_coalesce;
1820 :
1821 16 : __skb_pull(skb, hdrlen);
1822 :
1823 16 : shinfo = skb_shinfo(skb);
1824 16 : gso_size = shinfo->gso_size ?: skb->len;
1825 16 : gso_segs = shinfo->gso_segs ?: 1;
1826 :
1827 16 : shinfo = skb_shinfo(tail);
1828 16 : tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
1829 16 : tail_gso_segs = shinfo->gso_segs ?: 1;
1830 :
1831 16 : if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1832 15 : TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1833 :
1834 15 : if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
1835 15 : TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1836 15 : thtail->window = th->window;
1837 : }
1838 :
1839 : /* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1840 : * thtail->fin, so that the fast path in tcp_rcv_established()
1841 : * is not entered if we append a packet with a FIN.
1842 : * SYN, RST, URG are not present.
1843 : * ACK is set on both packets.
1844 : * PSH : the TCP stack does not really care,
1845 : * at least for 'GRO' packets.
1846 : */
1847 15 : thtail->fin |= th->fin;
1848 15 : TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1849 :
1850 15 : if (TCP_SKB_CB(skb)->has_rxtstamp) {
1851 0 : TCP_SKB_CB(tail)->has_rxtstamp = true;
1852 0 : tail->tstamp = skb->tstamp;
1853 0 : skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1854 : }
1855 :
1856 : /* Not as strict as GRO. We only need to carry the max mss value */
1857 15 : shinfo->gso_size = max(gso_size, tail_gso_size);
1858 15 : shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
1859 :
1860 15 : sk->sk_backlog.len += delta;
1861 15 : __NET_INC_STATS(sock_net(sk),
1862 : LINUX_MIB_TCPBACKLOGCOALESCE);
1863 15 : kfree_skb_partial(skb, fragstolen);
1864 15 : return false;
1865 : }
1866 1 : __skb_push(skb, hdrlen);
1867 :
1868 142 : no_coalesce:
1869 : /* Only the socket owner can try to collapse/prune rx queues
1870 : * to reduce memory overhead, so add a little headroom here.
1871 : * Only a few socket backlogs are likely to be non-empty at any given time.
1872 : */
1873 142 : limit += 64*1024;
1874 :
1875 142 : if (unlikely(sk_add_backlog(sk, skb, limit))) {
1876 0 : bh_unlock_sock(sk);
1877 0 : __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1878 0 : return true;
1879 : }
1880 : return false;
1881 : }
1882 : EXPORT_SYMBOL(tcp_add_backlog);
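/* Backlog accounting example (illustrative numbers): the drop threshold is
 * sk_rcvbuf + sk_sndbuf plus 64 KB of headroom, so with 128 KB receive and
 * 128 KB send buffers roughly 320 KB may sit in the backlog before
 * LINUX_MIB_TCPBACKLOGDROP is hit. Coalescing into the backlog tail is only
 * attempted when the new segment starts exactly at the tail's end_seq, the
 * flag/DSCP checks above pass, and the headers beyond the base TCP header
 * (i.e. the options) are byte-for-byte identical.
 */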
1883 :
1884 438 : int tcp_filter(struct sock *sk, struct sk_buff *skb)
1885 : {
1886 438 : struct tcphdr *th = (struct tcphdr *)skb->data;
1887 :
1888 438 : return sk_filter_trim_cap(sk, skb, th->doff * 4);
1889 : }
1890 : EXPORT_SYMBOL(tcp_filter);
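/* sk_filter_trim_cap() runs the socket's attached socket filter (if any);
 * the cap of th->doff * 4 ensures a filter verdict can at most trim the
 * payload, never the TCP header itself.
 */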
1891 :
1892 0 : static void tcp_v4_restore_cb(struct sk_buff *skb)
1893 : {
1894 0 : memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1895 : sizeof(struct inet_skb_parm));
1896 : }
1897 :
1898 438 : static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1899 : const struct tcphdr *th)
1900 : {
1901 : /* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
1902 : * barrier() makes sure the compiler won't play fool^Waliasing games.
1903 : */
1904 438 : memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1905 : sizeof(struct inet_skb_parm));
1906 438 : barrier();
1907 :
1908 438 : TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1909 438 : TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1910 438 : skb->len - th->doff * 4);
1911 438 : TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1912 438 : TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1913 438 : TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1914 438 : TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1915 438 : TCP_SKB_CB(skb)->sacked = 0;
1916 876 : TCP_SKB_CB(skb)->has_rxtstamp =
1917 438 : skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1918 438 : }
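/* end_seq example: end_seq = seq + SYN + FIN + payload length, so a bare SYN
 * advances the sequence space by one, a 1000-byte segment by 1000, and a
 * 1000-byte segment carrying FIN by 1001.
 */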
1919 :
1920 : /*
1921 : * From tcp_input.c
1922 : */
1923 :
1924 438 : int tcp_v4_rcv(struct sk_buff *skb)
1925 : {
1926 438 : struct net *net = dev_net(skb->dev);
1927 438 : struct sk_buff *skb_to_free;
1928 438 : int sdif = inet_sdif(skb);
1929 438 : int dif = inet_iif(skb);
1930 438 : const struct iphdr *iph;
1931 438 : const struct tcphdr *th;
1932 438 : bool refcounted;
1933 438 : struct sock *sk;
1934 438 : int ret;
1935 :
1936 438 : if (skb->pkt_type != PACKET_HOST)
1937 0 : goto discard_it;
1938 :
1939 : /* Count it even if it's bad */
1940 438 : __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1941 :
1942 438 : if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1943 0 : goto discard_it;
1944 :
1945 438 : th = (const struct tcphdr *)skb->data;
1946 :
1947 438 : if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1948 0 : goto bad_packet;
1949 438 : if (!pskb_may_pull(skb, th->doff * 4))
1950 0 : goto discard_it;
1951 :
1952 : /* An explanation is required here, I think.
1953 : * Packet length and doff are validated by header prediction,
1954 : * provided the case of th->doff==0 is eliminated.
1955 : * So, we defer the checks. */
1956 :
1957 438 : if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1958 0 : goto csum_error;
1959 :
1960 438 : th = (const struct tcphdr *)skb->data;
1961 438 : iph = ip_hdr(skb);
1962 438 : lookup:
1963 876 : sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1964 438 : th->dest, sdif, &refcounted);
1965 438 : if (!sk)
1966 0 : goto no_tcp_socket;
1967 :
1968 438 : process:
1969 438 : if (sk->sk_state == TCP_TIME_WAIT)
1970 0 : goto do_time_wait;
1971 :
1972 438 : if (sk->sk_state == TCP_NEW_SYN_RECV) {
1973 4 : struct request_sock *req = inet_reqsk(sk);
1974 4 : bool req_stolen = false;
1975 4 : struct sock *nsk;
1976 :
1977 4 : sk = req->rsk_listener;
1978 4 : if (unlikely(tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))) {
1979 : sk_drops_add(sk, skb);
1980 : reqsk_put(req);
1981 : goto discard_it;
1982 : }
1983 4 : if (tcp_checksum_complete(skb)) {
1984 0 : reqsk_put(req);
1985 0 : goto csum_error;
1986 : }
1987 4 : if (unlikely(sk->sk_state != TCP_LISTEN)) {
1988 0 : inet_csk_reqsk_queue_drop_and_put(sk, req);
1989 0 : goto lookup;
1990 : }
1991 : /* We own a reference on the listener, increase it again
1992 : * as we might lose it too soon.
1993 : */
1994 4 : sock_hold(sk);
1995 4 : refcounted = true;
1996 4 : nsk = NULL;
1997 4 : if (!tcp_filter(sk, skb)) {
1998 4 : th = (const struct tcphdr *)skb->data;
1999 4 : iph = ip_hdr(skb);
2000 4 : tcp_v4_fill_cb(skb, iph, th);
2001 4 : nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
2002 : }
2003 4 : if (!nsk) {
2004 0 : reqsk_put(req);
2005 0 : if (req_stolen) {
2006 : /* Another cpu got exclusive access to req
2007 : * and created a full-blown socket.
2008 : * Try to feed this packet to this socket
2009 : * instead of discarding it.
2010 : */
2011 0 : tcp_v4_restore_cb(skb);
2012 0 : sock_put(sk);
2013 0 : goto lookup;
2014 : }
2015 0 : goto discard_and_relse;
2016 : }
2017 4 : if (nsk == sk) {
2018 0 : reqsk_put(req);
2019 0 : tcp_v4_restore_cb(skb);
2020 4 : } else if (tcp_child_process(sk, nsk, skb)) {
2021 0 : tcp_v4_send_reset(nsk, skb);
2022 0 : goto discard_and_relse;
2023 : } else {
2024 4 : sock_put(sk);
2025 4 : return 0;
2026 : }
2027 : }
2028 434 : if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
2029 0 : __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2030 0 : goto discard_and_relse;
2031 : }
2032 :
2033 434 : if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
2034 : goto discard_and_relse;
2035 :
2036 434 : if (tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))
2037 : goto discard_and_relse;
2038 :
2039 434 : nf_reset_ct(skb);
2040 :
2041 434 : if (tcp_filter(sk, skb))
2042 0 : goto discard_and_relse;
2043 434 : th = (const struct tcphdr *)skb->data;
2044 434 : iph = ip_hdr(skb);
2045 434 : tcp_v4_fill_cb(skb, iph, th);
2046 :
2047 434 : skb->dev = NULL;
2048 :
2049 434 : if (sk->sk_state == TCP_LISTEN) {
2050 4 : ret = tcp_v4_do_rcv(sk, skb);
2051 4 : goto put_and_return;
2052 : }
2053 :
2054 430 : sk_incoming_cpu_update(sk);
2055 :
2056 430 : bh_lock_sock_nested(sk);
2057 430 : tcp_segs_in(tcp_sk(sk), skb);
2058 430 : ret = 0;
2059 430 : if (!sock_owned_by_user(sk)) {
2060 273 : skb_to_free = sk->sk_rx_skb_cache;
2061 273 : sk->sk_rx_skb_cache = NULL;
2062 273 : ret = tcp_v4_do_rcv(sk, skb);
2063 : } else {
2064 157 : if (tcp_add_backlog(sk, skb))
2065 0 : goto discard_and_relse;
2066 : skb_to_free = NULL;
2067 : }
2068 430 : bh_unlock_sock(sk);
2069 430 : if (skb_to_free)
2070 0 : __kfree_skb(skb_to_free);
2071 :
2072 430 : put_and_return:
2073 434 : if (refcounted)
2074 430 : sock_put(sk);
2075 :
2076 : return ret;
2077 :
2078 0 : no_tcp_socket:
2079 0 : if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2080 : goto discard_it;
2081 :
2082 0 : tcp_v4_fill_cb(skb, iph, th);
2083 :
2084 0 : if (tcp_checksum_complete(skb)) {
2085 0 : csum_error:
2086 0 : __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2087 0 : bad_packet:
2088 0 : __TCP_INC_STATS(net, TCP_MIB_INERRS);
2089 : } else {
2090 0 : tcp_v4_send_reset(NULL, skb);
2091 : }
2092 :
2093 0 : discard_it:
2094 : /* Discard frame. */
2095 0 : kfree_skb(skb);
2096 0 : return 0;
2097 :
2098 0 : discard_and_relse:
2099 0 : sk_drops_add(sk, skb);
2100 0 : if (refcounted)
2101 0 : sock_put(sk);
2102 0 : goto discard_it;
2103 :
2104 0 : do_time_wait:
2105 0 : if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2106 : inet_twsk_put(inet_twsk(sk));
2107 : goto discard_it;
2108 : }
2109 :
2110 0 : tcp_v4_fill_cb(skb, iph, th);
2111 :
2112 0 : if (tcp_checksum_complete(skb)) {
2113 0 : inet_twsk_put(inet_twsk(sk));
2114 0 : goto csum_error;
2115 : }
2116 0 : switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2117 0 : case TCP_TW_SYN: {
2118 0 : struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
2119 : &tcp_hashinfo, skb,
2120 0 : __tcp_hdrlen(th),
2121 0 : iph->saddr, th->source,
2122 0 : iph->daddr, th->dest,
2123 : inet_iif(skb),
2124 : sdif);
2125 0 : if (sk2) {
2126 0 : inet_twsk_deschedule_put(inet_twsk(sk));
2127 0 : sk = sk2;
2128 0 : tcp_v4_restore_cb(skb);
2129 0 : refcounted = false;
2130 0 : goto process;
2131 : }
2132 : }
2133 : /* to ACK */
2134 0 : fallthrough;
2135 : case TCP_TW_ACK:
2136 0 : tcp_v4_timewait_ack(sk, skb);
2137 0 : break;
2138 0 : case TCP_TW_RST:
2139 0 : tcp_v4_send_reset(sk, skb);
2140 0 : inet_twsk_deschedule_put(inet_twsk(sk));
2141 0 : goto discard_it;
2142 0 : case TCP_TW_SUCCESS:;
2143 : }
2144 0 : goto discard_it;
2145 : }
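/* Receive path summary: after the socket lookup, TIME_WAIT and NEW_SYN_RECV
 * get their special handling above; for full sockets we run the xfrm policy,
 * MD5 and socket-filter checks, then either process the segment directly
 * under the bottom-half lock or, if the socket is owned by a user context,
 * queue it to the backlog via tcp_add_backlog().
 */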
2146 :
2147 : static struct timewait_sock_ops tcp_timewait_sock_ops = {
2148 : .twsk_obj_size = sizeof(struct tcp_timewait_sock),
2149 : .twsk_unique = tcp_twsk_unique,
2150 : .twsk_destructor= tcp_twsk_destructor,
2151 : };
2152 :
2153 4 : void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2154 : {
2155 4 : struct dst_entry *dst = skb_dst(skb);
2156 :
2157 4 : if (dst && dst_hold_safe(dst)) {
2158 4 : sk->sk_rx_dst = dst;
2159 4 : inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
2160 : }
2161 4 : }
2162 : EXPORT_SYMBOL(inet_sk_rx_dst_set);
2163 :
2164 : const struct inet_connection_sock_af_ops ipv4_specific = {
2165 : .queue_xmit = ip_queue_xmit,
2166 : .send_check = tcp_v4_send_check,
2167 : .rebuild_header = inet_sk_rebuild_header,
2168 : .sk_rx_dst_set = inet_sk_rx_dst_set,
2169 : .conn_request = tcp_v4_conn_request,
2170 : .syn_recv_sock = tcp_v4_syn_recv_sock,
2171 : .net_header_len = sizeof(struct iphdr),
2172 : .setsockopt = ip_setsockopt,
2173 : .getsockopt = ip_getsockopt,
2174 : .addr2sockaddr = inet_csk_addr2sockaddr,
2175 : .sockaddr_len = sizeof(struct sockaddr_in),
2176 : .mtu_reduced = tcp_v4_mtu_reduced,
2177 : };
2178 : EXPORT_SYMBOL(ipv4_specific);
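/* ipv4_specific is the address-family indirection used by the
 * protocol-independent TCP core; tcp_ipv6.c installs an equivalent table for
 * IPv6 (and a "mapped" variant for IPv4-mapped addresses on AF_INET6 sockets).
 */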
2179 :
2180 : #ifdef CONFIG_TCP_MD5SIG
2181 : static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2182 : .md5_lookup = tcp_v4_md5_lookup,
2183 : .calc_md5_hash = tcp_v4_md5_hash_skb,
2184 : .md5_parse = tcp_v4_parse_md5_keys,
2185 : };
2186 : #endif
2187 :
2188 : /* NOTE: A lot of things are set to zero explicitly by the call to
2189 : * sk_alloc(), so they need not be done here.
2190 : */
2191 3 : static int tcp_v4_init_sock(struct sock *sk)
2192 : {
2193 3 : struct inet_connection_sock *icsk = inet_csk(sk);
2194 :
2195 3 : tcp_init_sock(sk);
2196 :
2197 3 : icsk->icsk_af_ops = &ipv4_specific;
2198 :
2199 : #ifdef CONFIG_TCP_MD5SIG
2200 : tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2201 : #endif
2202 :
2203 3 : return 0;
2204 : }
2205 :
2206 4 : void tcp_v4_destroy_sock(struct sock *sk)
2207 : {
2208 4 : struct tcp_sock *tp = tcp_sk(sk);
2209 :
2210 4 : trace_tcp_destroy_sock(sk);
2211 :
2212 4 : tcp_clear_xmit_timers(sk);
2213 :
2214 4 : tcp_cleanup_congestion_control(sk);
2215 :
2216 4 : tcp_cleanup_ulp(sk);
2217 :
2218 : /* Clean up the write buffer. */
2219 4 : tcp_write_queue_purge(sk);
2220 :
2221 : /* Check if we want to disable active TFO */
2222 4 : tcp_fastopen_active_disable_ofo_check(sk);
2223 :
2224 : /* Cleans up our, hopefully empty, out_of_order_queue. */
2225 4 : skb_rbtree_purge(&tp->out_of_order_queue);
2226 :
2227 : #ifdef CONFIG_TCP_MD5SIG
2228 : /* Clean up the MD5 key list, if any */
2229 : if (tp->md5sig_info) {
2230 : tcp_clear_md5_list(sk);
2231 : kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2232 : tp->md5sig_info = NULL;
2233 : }
2234 : #endif
2235 :
2236 : /* Clean up a referenced TCP bind bucket. */
2237 4 : if (inet_csk(sk)->icsk_bind_hash)
2238 1 : inet_put_port(sk);
2239 :
2240 4 : BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2241 :
2242 : /* If socket is aborted during connect operation */
2243 4 : tcp_free_fastopen_req(tp);
2244 4 : tcp_fastopen_destroy_cipher(sk);
2245 4 : tcp_saved_syn_free(tp);
2246 :
2247 4 : sk_sockets_allocated_dec(sk);
2248 4 : }
2249 : EXPORT_SYMBOL(tcp_v4_destroy_sock);
2250 :
2251 : #ifdef CONFIG_PROC_FS
2252 : /* Proc filesystem TCP sock list dumping. */
2253 :
2254 : /*
2255 : * Get the next listener socket following cur. If cur is NULL, get the first socket
2256 : * starting from bucket given in st->bucket; when st->bucket is zero the
2257 : * very first socket in the hash table is returned.
2258 : */
2259 0 : static void *listening_get_next(struct seq_file *seq, void *cur)
2260 : {
2261 0 : struct tcp_seq_afinfo *afinfo;
2262 0 : struct tcp_iter_state *st = seq->private;
2263 0 : struct net *net = seq_file_net(seq);
2264 0 : struct inet_listen_hashbucket *ilb;
2265 0 : struct hlist_nulls_node *node;
2266 0 : struct sock *sk = cur;
2267 :
2268 0 : if (st->bpf_seq_afinfo)
2269 : afinfo = st->bpf_seq_afinfo;
2270 : else
2271 0 : afinfo = PDE_DATA(file_inode(seq->file));
2272 :
2273 0 : if (!sk) {
2274 0 : get_head:
2275 0 : ilb = &tcp_hashinfo.listening_hash[st->bucket];
2276 0 : spin_lock(&ilb->lock);
2277 0 : sk = sk_nulls_head(&ilb->nulls_head);
2278 0 : st->offset = 0;
2279 0 : goto get_sk;
2280 : }
2281 0 : ilb = &tcp_hashinfo.listening_hash[st->bucket];
2282 0 : ++st->num;
2283 0 : ++st->offset;
2284 :
2285 0 : sk = sk_nulls_next(sk);
2286 0 : get_sk:
2287 0 : sk_nulls_for_each_from(sk, node) {
2288 0 : if (!net_eq(sock_net(sk), net))
2289 : continue;
2290 0 : if (afinfo->family == AF_UNSPEC ||
2291 0 : sk->sk_family == afinfo->family)
2292 0 : return sk;
2293 : }
2294 0 : spin_unlock(&ilb->lock);
2295 0 : st->offset = 0;
2296 0 : if (++st->bucket < INET_LHTABLE_SIZE)
2297 0 : goto get_head;
2298 : return NULL;
2299 : }
2300 :
2301 0 : static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2302 : {
2303 0 : struct tcp_iter_state *st = seq->private;
2304 0 : void *rc;
2305 :
2306 0 : st->bucket = 0;
2307 0 : st->offset = 0;
2308 0 : rc = listening_get_next(seq, NULL);
2309 :
2310 0 : while (rc && *pos) {
2311 0 : rc = listening_get_next(seq, rc);
2312 0 : --*pos;
2313 : }
2314 0 : return rc;
2315 : }
2316 :
2317 0 : static inline bool empty_bucket(const struct tcp_iter_state *st)
2318 : {
2319 0 : return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2320 : }
2321 :
2322 : /*
2323 : * Get first established socket starting from bucket given in st->bucket.
2324 : * If st->bucket is zero, the very first socket in the hash is returned.
2325 : */
2326 0 : static void *established_get_first(struct seq_file *seq)
2327 : {
2328 0 : struct tcp_seq_afinfo *afinfo;
2329 0 : struct tcp_iter_state *st = seq->private;
2330 0 : struct net *net = seq_file_net(seq);
2331 0 : void *rc = NULL;
2332 :
2333 0 : if (st->bpf_seq_afinfo)
2334 : afinfo = st->bpf_seq_afinfo;
2335 : else
2336 0 : afinfo = PDE_DATA(file_inode(seq->file));
2337 :
2338 0 : st->offset = 0;
2339 0 : for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2340 0 : struct sock *sk;
2341 0 : struct hlist_nulls_node *node;
2342 0 : spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2343 :
2344 : /* Lockless fast path for the common case of empty buckets */
2345 0 : if (empty_bucket(st))
2346 0 : continue;
2347 :
2348 0 : spin_lock_bh(lock);
2349 0 : sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2350 0 : if ((afinfo->family != AF_UNSPEC &&
2351 0 : sk->sk_family != afinfo->family) ||
2352 0 : !net_eq(sock_net(sk), net)) {
2353 0 : continue;
2354 : }
2355 0 : rc = sk;
2356 0 : goto out;
2357 : }
2358 0 : spin_unlock_bh(lock);
2359 : }
2360 0 : out:
2361 0 : return rc;
2362 : }
2363 :
2364 0 : static void *established_get_next(struct seq_file *seq, void *cur)
2365 : {
2366 0 : struct tcp_seq_afinfo *afinfo;
2367 0 : struct sock *sk = cur;
2368 0 : struct hlist_nulls_node *node;
2369 0 : struct tcp_iter_state *st = seq->private;
2370 0 : struct net *net = seq_file_net(seq);
2371 :
2372 0 : if (st->bpf_seq_afinfo)
2373 : afinfo = st->bpf_seq_afinfo;
2374 : else
2375 0 : afinfo = PDE_DATA(file_inode(seq->file));
2376 :
2377 0 : ++st->num;
2378 0 : ++st->offset;
2379 :
2380 0 : sk = sk_nulls_next(sk);
2381 :
2382 0 : sk_nulls_for_each_from(sk, node) {
2383 0 : if ((afinfo->family == AF_UNSPEC ||
2384 0 : sk->sk_family == afinfo->family) &&
2385 0 : net_eq(sock_net(sk), net))
2386 0 : return sk;
2387 : }
2388 :
2389 0 : spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2390 0 : ++st->bucket;
2391 0 : return established_get_first(seq);
2392 : }
2393 :
2394 0 : static void *established_get_idx(struct seq_file *seq, loff_t pos)
2395 : {
2396 0 : struct tcp_iter_state *st = seq->private;
2397 0 : void *rc;
2398 :
2399 0 : st->bucket = 0;
2400 0 : rc = established_get_first(seq);
2401 :
2402 0 : while (rc && pos) {
2403 0 : rc = established_get_next(seq, rc);
2404 0 : --pos;
2405 : }
2406 0 : return rc;
2407 : }
2408 :
2409 0 : static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2410 : {
2411 0 : void *rc;
2412 0 : struct tcp_iter_state *st = seq->private;
2413 :
2414 0 : st->state = TCP_SEQ_STATE_LISTENING;
2415 0 : rc = listening_get_idx(seq, &pos);
2416 :
2417 0 : if (!rc) {
2418 0 : st->state = TCP_SEQ_STATE_ESTABLISHED;
2419 0 : rc = established_get_idx(seq, pos);
2420 : }
2421 :
2422 0 : return rc;
2423 : }
2424 :
2425 0 : static void *tcp_seek_last_pos(struct seq_file *seq)
2426 : {
2427 0 : struct tcp_iter_state *st = seq->private;
2428 0 : int offset = st->offset;
2429 0 : int orig_num = st->num;
2430 0 : void *rc = NULL;
2431 :
2432 0 : switch (st->state) {
2433 0 : case TCP_SEQ_STATE_LISTENING:
2434 0 : if (st->bucket >= INET_LHTABLE_SIZE)
2435 : break;
2436 0 : st->state = TCP_SEQ_STATE_LISTENING;
2437 0 : rc = listening_get_next(seq, NULL);
2438 0 : while (offset-- && rc)
2439 0 : rc = listening_get_next(seq, rc);
2440 0 : if (rc)
2441 : break;
2442 0 : st->bucket = 0;
2443 0 : st->state = TCP_SEQ_STATE_ESTABLISHED;
2444 0 : fallthrough;
2445 0 : case TCP_SEQ_STATE_ESTABLISHED:
2446 0 : if (st->bucket > tcp_hashinfo.ehash_mask)
2447 : break;
2448 0 : rc = established_get_first(seq);
2449 0 : while (offset-- && rc)
2450 0 : rc = established_get_next(seq, rc);
2451 : }
2452 :
2453 0 : st->num = orig_num;
2454 :
2455 0 : return rc;
2456 : }
2457 :
2458 0 : void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2459 : {
2460 0 : struct tcp_iter_state *st = seq->private;
2461 0 : void *rc;
2462 :
2463 0 : if (*pos && *pos == st->last_pos) {
2464 0 : rc = tcp_seek_last_pos(seq);
2465 0 : if (rc)
2466 0 : goto out;
2467 : }
2468 :
2469 0 : st->state = TCP_SEQ_STATE_LISTENING;
2470 0 : st->num = 0;
2471 0 : st->bucket = 0;
2472 0 : st->offset = 0;
2473 0 : rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2474 :
2475 0 : out:
2476 0 : st->last_pos = *pos;
2477 0 : return rc;
2478 : }
2479 : EXPORT_SYMBOL(tcp_seq_start);
2480 :
2481 0 : void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2482 : {
2483 0 : struct tcp_iter_state *st = seq->private;
2484 0 : void *rc = NULL;
2485 :
2486 0 : if (v == SEQ_START_TOKEN) {
2487 0 : rc = tcp_get_idx(seq, 0);
2488 0 : goto out;
2489 : }
2490 :
2491 0 : switch (st->state) {
2492 0 : case TCP_SEQ_STATE_LISTENING:
2493 0 : rc = listening_get_next(seq, v);
2494 0 : if (!rc) {
2495 0 : st->state = TCP_SEQ_STATE_ESTABLISHED;
2496 0 : st->bucket = 0;
2497 0 : st->offset = 0;
2498 0 : rc = established_get_first(seq);
2499 : }
2500 : break;
2501 0 : case TCP_SEQ_STATE_ESTABLISHED:
2502 0 : rc = established_get_next(seq, v);
2503 0 : break;
2504 : }
2505 0 : out:
2506 0 : ++*pos;
2507 0 : st->last_pos = *pos;
2508 0 : return rc;
2509 : }
2510 : EXPORT_SYMBOL(tcp_seq_next);
2511 :
2512 0 : void tcp_seq_stop(struct seq_file *seq, void *v)
2513 : {
2514 0 : struct tcp_iter_state *st = seq->private;
2515 :
2516 0 : switch (st->state) {
2517 0 : case TCP_SEQ_STATE_LISTENING:
2518 0 : if (v != SEQ_START_TOKEN)
2519 0 : spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
2520 : break;
2521 0 : case TCP_SEQ_STATE_ESTABLISHED:
2522 0 : if (v)
2523 0 : spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2524 : break;
2525 : }
2526 0 : }
2527 : EXPORT_SYMBOL(tcp_seq_stop);
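/* The seq_file walk visits the listening hash first and then the established
 * hash; st->bucket, st->offset and st->last_pos let a resumed read() continue
 * from the previous position instead of rescanning from the start.
 */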
2528 :
2529 0 : static void get_openreq4(const struct request_sock *req,
2530 : struct seq_file *f, int i)
2531 : {
2532 0 : const struct inet_request_sock *ireq = inet_rsk(req);
2533 0 : long delta = req->rsk_timer.expires - jiffies;
2534 :
2535 0 : seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2536 : " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2537 : i,
2538 : ireq->ir_loc_addr,
2539 0 : ireq->ir_num,
2540 : ireq->ir_rmt_addr,
2541 0 : ntohs(ireq->ir_rmt_port),
2542 : TCP_SYN_RECV,
2543 : 0, 0, /* could print option size, but that is af dependent. */
2544 : 1, /* timers active (only the expire timer) */
2545 : jiffies_delta_to_clock_t(delta),
2546 0 : req->num_timeout,
2547 : from_kuid_munged(seq_user_ns(f),
2548 : sock_i_uid(req->rsk_listener)),
2549 : 0, /* non standard timer */
2550 : 0, /* open_requests have no inode */
2551 : 0,
2552 : req);
2553 0 : }
2554 :
2555 0 : static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2556 : {
2557 0 : int timer_active;
2558 0 : unsigned long timer_expires;
2559 0 : const struct tcp_sock *tp = tcp_sk(sk);
2560 0 : const struct inet_connection_sock *icsk = inet_csk(sk);
2561 0 : const struct inet_sock *inet = inet_sk(sk);
2562 0 : const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2563 0 : __be32 dest = inet->inet_daddr;
2564 0 : __be32 src = inet->inet_rcv_saddr;
2565 0 : __u16 destp = ntohs(inet->inet_dport);
2566 0 : __u16 srcp = ntohs(inet->inet_sport);
2567 0 : int rx_queue;
2568 0 : int state;
2569 :
2570 0 : if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2571 0 : icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2572 : icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2573 0 : timer_active = 1;
2574 0 : timer_expires = icsk->icsk_timeout;
2575 0 : } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2576 0 : timer_active = 4;
2577 0 : timer_expires = icsk->icsk_timeout;
2578 0 : } else if (timer_pending(&sk->sk_timer)) {
2579 0 : timer_active = 2;
2580 0 : timer_expires = sk->sk_timer.expires;
2581 : } else {
2582 0 : timer_active = 0;
2583 0 : timer_expires = jiffies;
2584 : }
2585 :
2586 0 : state = inet_sk_state_load(sk);
2587 0 : if (state == TCP_LISTEN)
2588 0 : rx_queue = READ_ONCE(sk->sk_ack_backlog);
2589 : else
2590 : /* Because we don't lock the socket,
2591 : * we might find a transient negative value.
2592 : */
2593 0 : rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2594 : READ_ONCE(tp->copied_seq), 0);
2595 :
2596 0 : seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2597 : "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2598 : i, src, srcp, dest, destp, state,
2599 0 : READ_ONCE(tp->write_seq) - tp->snd_una,
2600 : rx_queue,
2601 : timer_active,
2602 0 : jiffies_delta_to_clock_t(timer_expires - jiffies),
2603 0 : icsk->icsk_retransmits,
2604 : from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2605 0 : icsk->icsk_probes_out,
2606 : sock_i_ino(sk),
2607 0 : refcount_read(&sk->sk_refcnt), sk,
2608 0 : jiffies_to_clock_t(icsk->icsk_rto),
2609 0 : jiffies_to_clock_t(icsk->icsk_ack.ato),
2610 0 : (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2611 : tp->snd_cwnd,
2612 : state == TCP_LISTEN ?
2613 0 : fastopenq->max_qlen :
2614 0 : (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2615 0 : }
2616 :
2617 0 : static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2618 : struct seq_file *f, int i)
2619 : {
2620 0 : long delta = tw->tw_timer.expires - jiffies;
2621 0 : __be32 dest, src;
2622 0 : __u16 destp, srcp;
2623 :
2624 0 : dest = tw->tw_daddr;
2625 0 : src = tw->tw_rcv_saddr;
2626 0 : destp = ntohs(tw->tw_dport);
2627 0 : srcp = ntohs(tw->tw_sport);
2628 :
2629 0 : seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2630 : " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2631 0 : i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2632 : 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2633 : refcount_read(&tw->tw_refcnt), tw);
2634 0 : }
2635 :
2636 : #define TMPSZ 150
2637 :
2638 0 : static int tcp4_seq_show(struct seq_file *seq, void *v)
2639 : {
2640 0 : struct tcp_iter_state *st;
2641 0 : struct sock *sk = v;
2642 :
2643 0 : seq_setwidth(seq, TMPSZ - 1);
2644 0 : if (v == SEQ_START_TOKEN) {
2645 0 : seq_puts(seq, " sl local_address rem_address st tx_queue "
2646 : "rx_queue tr tm->when retrnsmt uid timeout "
2647 : "inode");
2648 0 : goto out;
2649 : }
2650 0 : st = seq->private;
2651 :
2652 0 : if (sk->sk_state == TCP_TIME_WAIT)
2653 0 : get_timewait4_sock(v, seq, st->num);
2654 0 : else if (sk->sk_state == TCP_NEW_SYN_RECV)
2655 0 : get_openreq4(v, seq, st->num);
2656 : else
2657 0 : get_tcp4_sock(v, seq, st->num);
2658 0 : out:
2659 0 : seq_pad(seq, '\n');
2660 0 : return 0;
2661 : }
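/* An entry printed above looks roughly like this (illustrative values only):
 *
 *   0: 0100007F:0016 00000000:0000 0A 00000000:00000000 00:00000000 00000000  0 0 12345 ...
 *
 * i.e. hex local and remote address:port, state, tx/rx queue sizes, timer
 * info, retransmits, uid, timeout and inode, followed by the extra fields
 * emitted by get_tcp4_sock() (refcount, socket pointer, rto, ato,
 * quickack/pingpong, cwnd and ssthresh).
 */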
2662 :
2663 : #ifdef CONFIG_BPF_SYSCALL
2664 : struct bpf_iter__tcp {
2665 : __bpf_md_ptr(struct bpf_iter_meta *, meta);
2666 : __bpf_md_ptr(struct sock_common *, sk_common);
2667 : uid_t uid __aligned(8);
2668 : };
2669 :
2670 : static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
2671 : struct sock_common *sk_common, uid_t uid)
2672 : {
2673 : struct bpf_iter__tcp ctx;
2674 :
2675 : meta->seq_num--; /* skip SEQ_START_TOKEN */
2676 : ctx.meta = meta;
2677 : ctx.sk_common = sk_common;
2678 : ctx.uid = uid;
2679 : return bpf_iter_run_prog(prog, &ctx);
2680 : }
2681 :
2682 : static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
2683 : {
2684 : struct bpf_iter_meta meta;
2685 : struct bpf_prog *prog;
2686 : struct sock *sk = v;
2687 : uid_t uid;
2688 :
2689 : if (v == SEQ_START_TOKEN)
2690 : return 0;
2691 :
2692 : if (sk->sk_state == TCP_TIME_WAIT) {
2693 : uid = 0;
2694 : } else if (sk->sk_state == TCP_NEW_SYN_RECV) {
2695 : const struct request_sock *req = v;
2696 :
2697 : uid = from_kuid_munged(seq_user_ns(seq),
2698 : sock_i_uid(req->rsk_listener));
2699 : } else {
2700 : uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
2701 : }
2702 :
2703 : meta.seq = seq;
2704 : prog = bpf_iter_get_info(&meta, false);
2705 : return tcp_prog_seq_show(prog, &meta, v, uid);
2706 : }
2707 :
2708 : static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
2709 : {
2710 : struct bpf_iter_meta meta;
2711 : struct bpf_prog *prog;
2712 :
2713 : if (!v) {
2714 : meta.seq = seq;
2715 : prog = bpf_iter_get_info(&meta, true);
2716 : if (prog)
2717 : (void)tcp_prog_seq_show(prog, &meta, v, 0);
2718 : }
2719 :
2720 : tcp_seq_stop(seq, v);
2721 : }
2722 :
2723 : static const struct seq_operations bpf_iter_tcp_seq_ops = {
2724 : .show = bpf_iter_tcp_seq_show,
2725 : .start = tcp_seq_start,
2726 : .next = tcp_seq_next,
2727 : .stop = bpf_iter_tcp_seq_stop,
2728 : };
2729 : #endif
2730 :
2731 : static const struct seq_operations tcp4_seq_ops = {
2732 : .show = tcp4_seq_show,
2733 : .start = tcp_seq_start,
2734 : .next = tcp_seq_next,
2735 : .stop = tcp_seq_stop,
2736 : };
2737 :
2738 : static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2739 : .family = AF_INET,
2740 : };
2741 :
2742 1 : static int __net_init tcp4_proc_init_net(struct net *net)
2743 : {
2744 1 : if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
2745 : sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
2746 0 : return -ENOMEM;
2747 : return 0;
2748 : }
2749 :
2750 0 : static void __net_exit tcp4_proc_exit_net(struct net *net)
2751 : {
2752 0 : remove_proc_entry("tcp", net->proc_net);
2753 0 : }
2754 :
2755 : static struct pernet_operations tcp4_net_ops = {
2756 : .init = tcp4_proc_init_net,
2757 : .exit = tcp4_proc_exit_net,
2758 : };
2759 :
2760 1 : int __init tcp4_proc_init(void)
2761 : {
2762 1 : return register_pernet_subsys(&tcp4_net_ops);
2763 : }
2764 :
2765 0 : void tcp4_proc_exit(void)
2766 : {
2767 0 : unregister_pernet_subsys(&tcp4_net_ops);
2768 0 : }
2769 : #endif /* CONFIG_PROC_FS */
2770 :
2771 : /* @wake is one when sk_stream_write_space() calls us.
2772 : * This sends EPOLLOUT only if notsent_bytes is below half the limit.
2773 : * This mimics the strategy used in sock_def_write_space().
2774 : */
2775 1046 : bool tcp_stream_memory_free(const struct sock *sk, int wake)
2776 : {
2777 1046 : const struct tcp_sock *tp = tcp_sk(sk);
2778 1046 : u32 notsent_bytes = READ_ONCE(tp->write_seq) -
2779 1046 : READ_ONCE(tp->snd_nxt);
2780 :
2781 1046 : return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
2782 : }
2783 : EXPORT_SYMBOL(tcp_stream_memory_free);
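/* Worked example: with tcp_notsent_lowat = 128 KB and 70 KB still unsent, a
 * plain poll (wake == 0) reports the stream writable (70 KB < 128 KB), but
 * the wakeup path (wake == 1) doubles the comparison (140 KB >= 128 KB) and
 * stays quiet until unsent data drops below half the limit, which damps
 * EPOLLOUT storms.
 */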
2784 :
2785 : struct proto tcp_prot = {
2786 : .name = "TCP",
2787 : .owner = THIS_MODULE,
2788 : .close = tcp_close,
2789 : .pre_connect = tcp_v4_pre_connect,
2790 : .connect = tcp_v4_connect,
2791 : .disconnect = tcp_disconnect,
2792 : .accept = inet_csk_accept,
2793 : .ioctl = tcp_ioctl,
2794 : .init = tcp_v4_init_sock,
2795 : .destroy = tcp_v4_destroy_sock,
2796 : .shutdown = tcp_shutdown,
2797 : .setsockopt = tcp_setsockopt,
2798 : .getsockopt = tcp_getsockopt,
2799 : .bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt,
2800 : .keepalive = tcp_set_keepalive,
2801 : .recvmsg = tcp_recvmsg,
2802 : .sendmsg = tcp_sendmsg,
2803 : .sendpage = tcp_sendpage,
2804 : .backlog_rcv = tcp_v4_do_rcv,
2805 : .release_cb = tcp_release_cb,
2806 : .hash = inet_hash,
2807 : .unhash = inet_unhash,
2808 : .get_port = inet_csk_get_port,
2809 : .enter_memory_pressure = tcp_enter_memory_pressure,
2810 : .leave_memory_pressure = tcp_leave_memory_pressure,
2811 : .stream_memory_free = tcp_stream_memory_free,
2812 : .sockets_allocated = &tcp_sockets_allocated,
2813 : .orphan_count = &tcp_orphan_count,
2814 : .memory_allocated = &tcp_memory_allocated,
2815 : .memory_pressure = &tcp_memory_pressure,
2816 : .sysctl_mem = sysctl_tcp_mem,
2817 : .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem),
2818 : .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem),
2819 : .max_header = MAX_TCP_HEADER,
2820 : .obj_size = sizeof(struct tcp_sock),
2821 : .slab_flags = SLAB_TYPESAFE_BY_RCU,
2822 : .twsk_prot = &tcp_timewait_sock_ops,
2823 : .rsk_prot = &tcp_request_sock_ops,
2824 : .h.hashinfo = &tcp_hashinfo,
2825 : .no_autobind = true,
2826 : .diag_destroy = tcp_abort,
2827 : };
2828 : EXPORT_SYMBOL(tcp_prot);
2829 :
2830 0 : static void __net_exit tcp_sk_exit(struct net *net)
2831 : {
2832 0 : int cpu;
2833 :
2834 0 : if (net->ipv4.tcp_congestion_control)
2835 0 : bpf_module_put(net->ipv4.tcp_congestion_control,
2836 : net->ipv4.tcp_congestion_control->owner);
2837 :
2838 0 : for_each_possible_cpu(cpu)
2839 0 : inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2840 0 : free_percpu(net->ipv4.tcp_sk);
2841 0 : }
2842 :
2843 1 : static int __net_init tcp_sk_init(struct net *net)
2844 : {
2845 1 : int res, cpu, cnt;
2846 :
2847 1 : net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2848 1 : if (!net->ipv4.tcp_sk)
2849 : return -ENOMEM;
2850 :
2851 5 : for_each_possible_cpu(cpu) {
2852 4 : struct sock *sk;
2853 :
2854 4 : res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2855 : IPPROTO_TCP, net);
2856 4 : if (res)
2857 0 : goto fail;
2858 4 : sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2859 :
2860 : /* Please enforce IP_DF and IPID==0 for RST and
2861 : * ACK sent in SYN-RECV and TIME-WAIT state.
2862 : */
2863 4 : inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
2864 :
2865 4 : *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2866 : }
2867 :
2868 1 : net->ipv4.sysctl_tcp_ecn = 2;
2869 1 : net->ipv4.sysctl_tcp_ecn_fallback = 1;
2870 :
2871 1 : net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2872 1 : net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
2873 1 : net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2874 1 : net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2875 1 : net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
2876 :
2877 1 : net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2878 1 : net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2879 1 : net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2880 :
2881 1 : net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2882 1 : net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2883 1 : net->ipv4.sysctl_tcp_syncookies = 1;
2884 1 : net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2885 1 : net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2886 1 : net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2887 1 : net->ipv4.sysctl_tcp_orphan_retries = 0;
2888 1 : net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2889 1 : net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
2890 1 : net->ipv4.sysctl_tcp_tw_reuse = 2;
2891 1 : net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
2892 :
2893 1 : cnt = tcp_hashinfo.ehash_mask + 1;
2894 1 : net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
2895 1 : net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
2896 :
2897 1 : net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
2898 1 : net->ipv4.sysctl_tcp_sack = 1;
2899 1 : net->ipv4.sysctl_tcp_window_scaling = 1;
2900 1 : net->ipv4.sysctl_tcp_timestamps = 1;
2901 1 : net->ipv4.sysctl_tcp_early_retrans = 3;
2902 1 : net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
2903 1 : net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */
2904 1 : net->ipv4.sysctl_tcp_retrans_collapse = 1;
2905 1 : net->ipv4.sysctl_tcp_max_reordering = 300;
2906 1 : net->ipv4.sysctl_tcp_dsack = 1;
2907 1 : net->ipv4.sysctl_tcp_app_win = 31;
2908 1 : net->ipv4.sysctl_tcp_adv_win_scale = 1;
2909 1 : net->ipv4.sysctl_tcp_frto = 2;
2910 1 : net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
2911 : /* This limits the percentage of the congestion window which we
2912 : * will allow a single TSO frame to consume. Building TSO frames
2913 : * which are too large can cause TCP streams to be bursty.
2914 : */
2915 1 : net->ipv4.sysctl_tcp_tso_win_divisor = 3;
2916 : /* Default TSQ limit of 16 TSO segments */
2917 1 : net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
2918 : /* rfc5961 challenge ack rate limiting */
2919 1 : net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
2920 1 : net->ipv4.sysctl_tcp_min_tso_segs = 2;
2921 1 : net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
2922 1 : net->ipv4.sysctl_tcp_autocorking = 1;
2923 1 : net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
2924 1 : net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
2925 1 : net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
2926 1 : if (net != &init_net) {
2927 0 : memcpy(net->ipv4.sysctl_tcp_rmem,
2928 : init_net.ipv4.sysctl_tcp_rmem,
2929 : sizeof(init_net.ipv4.sysctl_tcp_rmem));
2930 0 : memcpy(net->ipv4.sysctl_tcp_wmem,
2931 : init_net.ipv4.sysctl_tcp_wmem,
2932 : sizeof(init_net.ipv4.sysctl_tcp_wmem));
2933 : }
2934 1 : net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
2935 1 : net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
2936 1 : net->ipv4.sysctl_tcp_comp_sack_nr = 44;
2937 1 : net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
2938 1 : spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
2939 1 : net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
2940 1 : atomic_set(&net->ipv4.tfo_active_disable_times, 0);
2941 :
2942 : /* Reno is always built in */
2943 1 : if (!net_eq(net, &init_net) &&
2944 : bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
2945 : init_net.ipv4.tcp_congestion_control->owner))
2946 : net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
2947 : else
2948 1 : net->ipv4.tcp_congestion_control = &tcp_reno;
2949 :
2950 1 : return 0;
2951 0 : fail:
2952 0 : tcp_sk_exit(net);
2953 :
2954 0 : return res;
2955 : }
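/* The values initialized above back the per-namespace net.ipv4.* sysctls, so
 * they can be inspected or overridden at runtime, e.g. (hypothetical shell
 * session):
 *
 *   sysctl net.ipv4.tcp_syncookies
 *   sysctl -w net.ipv4.tcp_notsent_lowat=131072
 */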
2956 :
2957 0 : static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2958 : {
2959 0 : struct net *net;
2960 :
2961 0 : inet_twsk_purge(&tcp_hashinfo, AF_INET);
2962 :
2963 0 : list_for_each_entry(net, net_exit_list, exit_list)
2964 0 : tcp_fastopen_ctx_destroy(net);
2965 0 : }
2966 :
2967 : static struct pernet_operations __net_initdata tcp_sk_ops = {
2968 : .init = tcp_sk_init,
2969 : .exit = tcp_sk_exit,
2970 : .exit_batch = tcp_sk_exit_batch,
2971 : };
2972 :
2973 : #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
2974 : DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
2975 : struct sock_common *sk_common, uid_t uid)
2976 :
2977 : static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
2978 : {
2979 : struct tcp_iter_state *st = priv_data;
2980 : struct tcp_seq_afinfo *afinfo;
2981 : int ret;
2982 :
2983 : afinfo = kmalloc(sizeof(*afinfo), GFP_USER | __GFP_NOWARN);
2984 : if (!afinfo)
2985 : return -ENOMEM;
2986 :
2987 : afinfo->family = AF_UNSPEC;
2988 : st->bpf_seq_afinfo = afinfo;
2989 : ret = bpf_iter_init_seq_net(priv_data, aux);
2990 : if (ret)
2991 : kfree(afinfo);
2992 : return ret;
2993 : }
2994 :
2995 : static void bpf_iter_fini_tcp(void *priv_data)
2996 : {
2997 : struct tcp_iter_state *st = priv_data;
2998 :
2999 : kfree(st->bpf_seq_afinfo);
3000 : bpf_iter_fini_seq_net(priv_data);
3001 : }
3002 :
3003 : static const struct bpf_iter_seq_info tcp_seq_info = {
3004 : .seq_ops = &bpf_iter_tcp_seq_ops,
3005 : .init_seq_private = bpf_iter_init_tcp,
3006 : .fini_seq_private = bpf_iter_fini_tcp,
3007 : .seq_priv_size = sizeof(struct tcp_iter_state),
3008 : };
3009 :
3010 : static struct bpf_iter_reg tcp_reg_info = {
3011 : .target = "tcp",
3012 : .ctx_arg_info_size = 1,
3013 : .ctx_arg_info = {
3014 : { offsetof(struct bpf_iter__tcp, sk_common),
3015 : PTR_TO_BTF_ID_OR_NULL },
3016 : },
3017 : .seq_info = &tcp_seq_info,
3018 : };
3019 :
3020 : static void __init bpf_iter_register(void)
3021 : {
3022 : tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3023 : if (bpf_iter_reg_target(&tcp_reg_info))
3024 : pr_warn("Warning: could not register bpf iterator tcp\n");
3025 : }
3026 :
3027 : #endif
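/* A BPF iterator program attached to the "tcp" target registered above walks
 * these sockets; one common (assumed, not shown here) workflow is to pin the
 * iterator with bpftool ("bpftool iter pin prog.o /sys/fs/bpf/tcp_iter") and
 * read the pinned file to dump per-socket data.
 */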
3028 :
3029 1 : void __init tcp_v4_init(void)
3030 : {
3031 1 : if (register_pernet_subsys(&tcp_sk_ops))
3032 0 : panic("Failed to create the TCP control socket.\n");
3033 :
3034 : #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3035 : bpf_iter_register();
3036 : #endif
3037 1 : }
|