Line data Source code
1 : /* SPDX-License-Identifier: GPL-2.0-or-later */
2 : /*
3 : * INET An implementation of the TCP/IP protocol suite for the LINUX
4 : * operating system. INET is implemented using the BSD Socket
5 : * interface as the means of communication with the user level.
6 : *
7 : * Definitions for the TCP module.
8 : *
9 : * Version: @(#)tcp.h 1.0.5 05/23/93
10 : *
11 : * Authors: Ross Biro
12 : * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
13 : */
14 : #ifndef _TCP_H
15 : #define _TCP_H
16 :
17 : #define FASTRETRANS_DEBUG 1
18 :
19 : #include <linux/list.h>
20 : #include <linux/tcp.h>
21 : #include <linux/bug.h>
22 : #include <linux/slab.h>
23 : #include <linux/cache.h>
24 : #include <linux/percpu.h>
25 : #include <linux/skbuff.h>
26 : #include <linux/kref.h>
27 : #include <linux/ktime.h>
28 : #include <linux/indirect_call_wrapper.h>
29 :
30 : #include <net/inet_connection_sock.h>
31 : #include <net/inet_timewait_sock.h>
32 : #include <net/inet_hashtables.h>
33 : #include <net/checksum.h>
34 : #include <net/request_sock.h>
35 : #include <net/sock_reuseport.h>
36 : #include <net/sock.h>
37 : #include <net/snmp.h>
38 : #include <net/ip.h>
39 : #include <net/tcp_states.h>
40 : #include <net/inet_ecn.h>
41 : #include <net/dst.h>
42 : #include <net/mptcp.h>
43 :
44 : #include <linux/seq_file.h>
45 : #include <linux/memcontrol.h>
46 : #include <linux/bpf-cgroup.h>
47 : #include <linux/siphash.h>
48 :
49 : extern struct inet_hashinfo tcp_hashinfo;
50 :
51 : extern struct percpu_counter tcp_orphan_count;
52 : void tcp_time_wait(struct sock *sk, int state, int timeo);
53 :
54 : #define MAX_TCP_HEADER L1_CACHE_ALIGN(128 + MAX_HEADER)
55 : #define MAX_TCP_OPTION_SPACE 40
56 : #define TCP_MIN_SND_MSS 48
57 : #define TCP_MIN_GSO_SIZE (TCP_MIN_SND_MSS - MAX_TCP_OPTION_SPACE)
58 :
59 : /*
60 : * Never offer a window over 32767 without using window scaling. Some
61 : * poor stacks do signed 16bit maths!
62 : */
63 : #define MAX_TCP_WINDOW 32767U
64 :
65 : /* Minimal accepted MSS. It is (60+60+8) - (20+20). */
66 : #define TCP_MIN_MSS 88U
67 :
68 : /* The initial MTU to use for probing */
69 : #define TCP_BASE_MSS 1024
70 :
71 : /* probing interval, default to 10 minutes as per RFC4821 */
72 : #define TCP_PROBE_INTERVAL 600
73 :
74 : /* Specify interval when tcp mtu probing will stop */
75 : #define TCP_PROBE_THRESHOLD 8
76 :
77 : /* After receiving this amount of duplicate ACKs fast retransmit starts. */
78 : #define TCP_FASTRETRANS_THRESH 3
79 :
80 : /* Maximal number of ACKs sent quickly to accelerate slow-start. */
81 : #define TCP_MAX_QUICKACKS 16U
82 :
83 : /* Maximal number of window scale according to RFC1323 */
84 : #define TCP_MAX_WSCALE 14U
85 :
86 : /* urg_data states */
87 : #define TCP_URG_VALID 0x0100
88 : #define TCP_URG_NOTYET 0x0200
89 : #define TCP_URG_READ 0x0400
90 :
91 : #define TCP_RETR1 3 /*
92 : * This is how many retries it does before it
93 : * tries to figure out if the gateway is
94 : * down. Minimal RFC value is 3; it corresponds
95 : * to ~3sec-8min depending on RTO.
96 : */
97 :
98 : #define TCP_RETR2 15 /*
99 : * This should take at least
100 : * 90 minutes to time out.
101 : * RFC1122 says that the limit is 100 sec.
102 : * 15 is ~13-30min depending on RTO.
103 : */
104 :
105 : #define TCP_SYN_RETRIES 6 /* This is how many retries are done
106 : * when active opening a connection.
107 : * RFC1122 says the minimum retry MUST
108 : * be at least 180secs. Nevertheless
109 : * this value is corresponding to
110 : * 63secs of retransmission with the
111 : * current initial RTO.
112 : */
113 :
114 : #define TCP_SYNACK_RETRIES 5 /* This is how many retries are done
115 : * when passive opening a connection.
116 : * This is corresponding to 31secs of
117 : * retransmission with the current
118 : * initial RTO.
119 : */
120 :
121 : #define TCP_TIMEWAIT_LEN (60*HZ) /* how long to wait to destroy TIME-WAIT
122 : * state, about 60 seconds */
123 : #define TCP_FIN_TIMEOUT TCP_TIMEWAIT_LEN
124 : /* BSD style FIN_WAIT2 deadlock breaker.
125 : * It used to be 3min, new value is 60sec,
126 : * to combine FIN-WAIT-2 timeout with
127 : * TIME-WAIT timer.
128 : */
129 : #define TCP_FIN_TIMEOUT_MAX (120 * HZ) /* max TCP_LINGER2 value (two minutes) */
130 :
131 : #define TCP_DELACK_MAX ((unsigned)(HZ/5)) /* maximal time to delay before sending an ACK */
132 : #if HZ >= 100
133 : #define TCP_DELACK_MIN ((unsigned)(HZ/25)) /* minimal time to delay before sending an ACK */
134 : #define TCP_ATO_MIN ((unsigned)(HZ/25))
135 : #else
136 : #define TCP_DELACK_MIN 4U
137 : #define TCP_ATO_MIN 4U
138 : #endif
139 : #define TCP_RTO_MAX ((unsigned)(120*HZ))
140 : #define TCP_RTO_MIN ((unsigned)(HZ/5))
141 : #define TCP_TIMEOUT_MIN (2U) /* Min timeout for TCP timers in jiffies */
142 : #define TCP_TIMEOUT_INIT ((unsigned)(1*HZ)) /* RFC6298 2.1 initial RTO value */
143 : #define TCP_TIMEOUT_FALLBACK ((unsigned)(3*HZ)) /* RFC 1122 initial RTO value, now
144 : * used as a fallback RTO for the
145 : * initial data transmission if no
146 : * valid RTT sample has been acquired,
147 : * most likely due to retrans in 3WHS.
148 : */
149 :
150 : #define TCP_RESOURCE_PROBE_INTERVAL ((unsigned)(HZ/2U)) /* Maximal interval between probes
151 : * for local resources.
152 : */
153 : #define TCP_KEEPALIVE_TIME (120*60*HZ) /* two hours */
154 : #define TCP_KEEPALIVE_PROBES 9 /* Max of 9 keepalive probes */
155 : #define TCP_KEEPALIVE_INTVL (75*HZ)
156 :
157 : #define MAX_TCP_KEEPIDLE 32767
158 : #define MAX_TCP_KEEPINTVL 32767
159 : #define MAX_TCP_KEEPCNT 127
160 : #define MAX_TCP_SYNCNT 127
161 :
162 : #define TCP_SYNQ_INTERVAL (HZ/5) /* Period of SYNACK timer */
163 :
164 : #define TCP_PAWS_24DAYS (60 * 60 * 24 * 24)
165 : #define TCP_PAWS_MSL 60 /* Per-host timestamps are invalidated
166 : * after this time. It should be equal
167 : * (or greater than) TCP_TIMEWAIT_LEN
168 : * to provide reliability equal to one
169 : * provided by timewait state.
170 : */
171 : #define TCP_PAWS_WINDOW 1 /* Replay window for per-host
172 : * timestamps. It must be less than
173 : * minimal timewait lifetime.
174 : */
175 : /*
176 : * TCP option
177 : */
178 :
179 : #define TCPOPT_NOP 1 /* Padding */
180 : #define TCPOPT_EOL 0 /* End of options */
181 : #define TCPOPT_MSS 2 /* Segment size negotiating */
182 : #define TCPOPT_WINDOW 3 /* Window scaling */
183 : #define TCPOPT_SACK_PERM 4 /* SACK Permitted */
184 : #define TCPOPT_SACK 5 /* SACK Block */
185 : #define TCPOPT_TIMESTAMP 8 /* Better RTT estimations/PAWS */
186 : #define TCPOPT_MD5SIG 19 /* MD5 Signature (RFC2385) */
187 : #define TCPOPT_MPTCP 30 /* Multipath TCP (RFC6824) */
188 : #define TCPOPT_FASTOPEN 34 /* Fast open (RFC7413) */
189 : #define TCPOPT_EXP 254 /* Experimental */
190 : /* Magic number to be after the option value for sharing TCP
191 : * experimental options. See draft-ietf-tcpm-experimental-options-00.txt
192 : */
193 : #define TCPOPT_FASTOPEN_MAGIC 0xF989
194 : #define TCPOPT_SMC_MAGIC 0xE2D4C3D9
195 :
196 : /*
197 : * TCP option lengths
198 : */
199 :
200 : #define TCPOLEN_MSS 4
201 : #define TCPOLEN_WINDOW 3
202 : #define TCPOLEN_SACK_PERM 2
203 : #define TCPOLEN_TIMESTAMP 10
204 : #define TCPOLEN_MD5SIG 18
205 : #define TCPOLEN_FASTOPEN_BASE 2
206 : #define TCPOLEN_EXP_FASTOPEN_BASE 4
207 : #define TCPOLEN_EXP_SMC_BASE 6
208 :
209 : /* But this is what stacks really send out. */
210 : #define TCPOLEN_TSTAMP_ALIGNED 12
211 : #define TCPOLEN_WSCALE_ALIGNED 4
212 : #define TCPOLEN_SACKPERM_ALIGNED 4
213 : #define TCPOLEN_SACK_BASE 2
214 : #define TCPOLEN_SACK_BASE_ALIGNED 4
215 : #define TCPOLEN_SACK_PERBLOCK 8
216 : #define TCPOLEN_MD5SIG_ALIGNED 20
217 : #define TCPOLEN_MSS_ALIGNED 4
218 : #define TCPOLEN_EXP_SMC_BASE_ALIGNED 8
219 :
220 : /* Flags in tp->nonagle */
221 : #define TCP_NAGLE_OFF 1 /* Nagle's algo is disabled */
222 : #define TCP_NAGLE_CORK 2 /* Socket is corked */
223 : #define TCP_NAGLE_PUSH 4 /* Cork is overridden for already queued data */
224 :
225 : /* TCP thin-stream limits */
226 : #define TCP_THIN_LINEAR_RETRIES 6 /* After 6 linear retries, do exp. backoff */
227 :
228 : /* TCP initial congestion window as per rfc6928 */
229 : #define TCP_INIT_CWND 10
230 :
231 : /* Bit Flags for sysctl_tcp_fastopen */
232 : #define TFO_CLIENT_ENABLE 1
233 : #define TFO_SERVER_ENABLE 2
234 : #define TFO_CLIENT_NO_COOKIE 4 /* Data in SYN w/o cookie option */
235 :
236 : /* Accept SYN data w/o any cookie option */
237 : #define TFO_SERVER_COOKIE_NOT_REQD 0x200
238 :
239 : /* Force enable TFO on all listeners, i.e., not requiring the
240 : * TCP_FASTOPEN socket option.
241 : */
242 : #define TFO_SERVER_WO_SOCKOPT1 0x400
243 :
244 :
245 : /* sysctl variables for tcp */
246 : extern int sysctl_tcp_max_orphans;
247 : extern long sysctl_tcp_mem[3];
248 :
249 : #define TCP_RACK_LOSS_DETECTION 0x1 /* Use RACK to detect losses */
250 : #define TCP_RACK_STATIC_REO_WND 0x2 /* Use static RACK reo wnd */
251 : #define TCP_RACK_NO_DUPTHRESH 0x4 /* Do not use DUPACK threshold in RACK */
252 :
253 : extern atomic_long_t tcp_memory_allocated;
254 : extern struct percpu_counter tcp_sockets_allocated;
255 : extern unsigned long tcp_memory_pressure;
256 :
257 : /* optimized version of sk_under_memory_pressure() for TCP sockets */
258 381 : static inline bool tcp_under_memory_pressure(const struct sock *sk)
259 : {
260 381 : if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
261 : mem_cgroup_under_socket_pressure(sk->sk_memcg))
262 : return true;
263 :
264 381 : return READ_ONCE(tcp_memory_pressure);
265 : }
266 : /*
267 : * The next routines deal with comparing 32 bit unsigned ints
268 : * and worry about wraparound (automatic with unsigned arithmetic).
269 : */
270 :
/*
 * Wraparound-safe "seq1 < seq2" for 32-bit sequence numbers.
 *
 * The unsigned difference is reinterpreted as signed, so seq1 counts as
 * "before" seq2 whenever it trails it by less than 2^31.
 */
static inline bool before(__u32 seq1, __u32 seq2)
{
	__s32 delta = (__s32)(seq1 - seq2);

	return delta < 0;
}
275 : #define after(seq2, seq1) before(seq1, seq2)
276 :
/*
 * Wraparound-safe range test: is seq2 <= seq1 <= seq3 ?
 *
 * Both distances are measured from seq2 with unsigned arithmetic, so the
 * comparison stays valid when the range straddles the 32-bit wrap point.
 */
static inline bool between(__u32 seq1, __u32 seq2, __u32 seq3)
{
	__u32 span = seq3 - seq2;
	__u32 offset = seq1 - seq2;

	return offset <= span;
}
282 :
283 3 : static inline bool tcp_out_of_memory(struct sock *sk)
284 : {
285 3 : if (sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
286 0 : sk_memory_allocated(sk) > sk_prot_mem_limits(sk, 2))
287 0 : return true;
288 : return false;
289 : }
290 :
291 : void sk_forced_mem_schedule(struct sock *sk, int size);
292 :
293 3 : static inline bool tcp_too_many_orphans(struct sock *sk, int shift)
294 : {
295 3 : struct percpu_counter *ocp = sk->sk_prot->orphan_count;
296 3 : int orphans = percpu_counter_read_positive(ocp);
297 :
298 3 : if (orphans << shift > sysctl_tcp_max_orphans) {
299 0 : orphans = percpu_counter_sum_positive(ocp);
300 0 : if (orphans << shift > sysctl_tcp_max_orphans)
301 0 : return true;
302 : }
303 : return false;
304 : }
305 :
306 : bool tcp_check_oom(struct sock *sk, int shift);
307 :
308 :
309 : extern struct proto tcp_prot;
310 :
311 : #define TCP_INC_STATS(net, field) SNMP_INC_STATS((net)->mib.tcp_statistics, field)
312 : #define __TCP_INC_STATS(net, field) __SNMP_INC_STATS((net)->mib.tcp_statistics, field)
313 : #define TCP_DEC_STATS(net, field) SNMP_DEC_STATS((net)->mib.tcp_statistics, field)
314 : #define TCP_ADD_STATS(net, field, val) SNMP_ADD_STATS((net)->mib.tcp_statistics, field, val)
315 :
316 : void tcp_tasklet_init(void);
317 :
318 : int tcp_v4_err(struct sk_buff *skb, u32);
319 :
320 : void tcp_shutdown(struct sock *sk, int how);
321 :
322 : int tcp_v4_early_demux(struct sk_buff *skb);
323 : int tcp_v4_rcv(struct sk_buff *skb);
324 :
325 : void tcp_remove_empty_skb(struct sock *sk, struct sk_buff *skb);
326 : int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw);
327 : int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size);
328 : int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size);
329 : int tcp_sendpage(struct sock *sk, struct page *page, int offset, size_t size,
330 : int flags);
331 : int tcp_sendpage_locked(struct sock *sk, struct page *page, int offset,
332 : size_t size, int flags);
333 : struct sk_buff *tcp_build_frag(struct sock *sk, int size_goal, int flags,
334 : struct page *page, int offset, size_t *size);
335 : ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
336 : size_t size, int flags);
337 : int tcp_send_mss(struct sock *sk, int *size_goal, int flags);
338 : void tcp_push(struct sock *sk, int flags, int mss_now, int nonagle,
339 : int size_goal);
340 : void tcp_release_cb(struct sock *sk);
341 : void tcp_wfree(struct sk_buff *skb);
342 : void tcp_write_timer_handler(struct sock *sk);
343 : void tcp_delack_timer_handler(struct sock *sk);
344 : int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg);
345 : int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb);
346 : void tcp_rcv_established(struct sock *sk, struct sk_buff *skb);
347 : void tcp_rcv_space_adjust(struct sock *sk);
348 : int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp);
349 : void tcp_twsk_destructor(struct sock *sk);
350 : ssize_t tcp_splice_read(struct socket *sk, loff_t *ppos,
351 : struct pipe_inode_info *pipe, size_t len,
352 : unsigned int flags);
353 :
354 : void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks);
355 426 : static inline void tcp_dec_quickack_mode(struct sock *sk,
356 : const unsigned int pkts)
357 : {
358 426 : struct inet_connection_sock *icsk = inet_csk(sk);
359 :
360 426 : if (icsk->icsk_ack.quick) {
361 98 : if (pkts >= icsk->icsk_ack.quick) {
362 14 : icsk->icsk_ack.quick = 0;
363 : /* Leaving quickack mode we deflate ATO. */
364 14 : icsk->icsk_ack.ato = TCP_ATO_MIN;
365 : } else
366 84 : icsk->icsk_ack.quick -= pkts;
367 : }
368 : }
369 :
370 : #define TCP_ECN_OK 1
371 : #define TCP_ECN_QUEUE_CWR 2
372 : #define TCP_ECN_DEMAND_CWR 4
373 : #define TCP_ECN_SEEN 8
374 :
/* Result codes returned by tcp_timewait_state_process(). */
enum tcp_tw_status {
	TCP_TW_SUCCESS = 0,
	TCP_TW_RST,		/* 1 */
	TCP_TW_ACK,		/* 2 */
	TCP_TW_SYN,		/* 3 */
};
381 :
382 :
383 : enum tcp_tw_status tcp_timewait_state_process(struct inet_timewait_sock *tw,
384 : struct sk_buff *skb,
385 : const struct tcphdr *th);
386 : struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
387 : struct request_sock *req, bool fastopen,
388 : bool *lost_race);
389 : int tcp_child_process(struct sock *parent, struct sock *child,
390 : struct sk_buff *skb);
391 : void tcp_enter_loss(struct sock *sk);
392 : void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int newly_lost, int flag);
393 : void tcp_clear_retrans(struct tcp_sock *tp);
394 : void tcp_update_metrics(struct sock *sk);
395 : void tcp_init_metrics(struct sock *sk);
396 : void tcp_metrics_init(void);
397 : bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst);
398 : void __tcp_close(struct sock *sk, long timeout);
399 : void tcp_close(struct sock *sk, long timeout);
400 : void tcp_init_sock(struct sock *sk);
401 : void tcp_init_transfer(struct sock *sk, int bpf_op, struct sk_buff *skb);
402 : __poll_t tcp_poll(struct file *file, struct socket *sock,
403 : struct poll_table_struct *wait);
404 : int tcp_getsockopt(struct sock *sk, int level, int optname,
405 : char __user *optval, int __user *optlen);
406 : bool tcp_bpf_bypass_getsockopt(int level, int optname);
407 : int tcp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
408 : unsigned int optlen);
409 : void tcp_set_keepalive(struct sock *sk, int val);
410 : void tcp_syn_ack_timeout(const struct request_sock *req);
411 : int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
412 : int flags, int *addr_len);
413 : int tcp_set_rcvlowat(struct sock *sk, int val);
414 : int tcp_set_window_clamp(struct sock *sk, int val);
415 : void tcp_data_ready(struct sock *sk);
416 : #ifdef CONFIG_MMU
417 : int tcp_mmap(struct file *file, struct socket *sock,
418 : struct vm_area_struct *vma);
419 : #endif
420 : void tcp_parse_options(const struct net *net, const struct sk_buff *skb,
421 : struct tcp_options_received *opt_rx,
422 : int estab, struct tcp_fastopen_cookie *foc);
423 : const u8 *tcp_parse_md5sig_option(const struct tcphdr *th);
424 :
425 : /*
426 : * BPF SKB-less helpers
427 : */
428 : u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
429 : struct tcphdr *th, u32 *cookie);
430 : u16 tcp_v6_get_syncookie(struct sock *sk, struct ipv6hdr *iph,
431 : struct tcphdr *th, u32 *cookie);
432 : u16 tcp_get_syncookie_mss(struct request_sock_ops *rsk_ops,
433 : const struct tcp_request_sock_ops *af_ops,
434 : struct sock *sk, struct tcphdr *th);
435 : /*
436 : * TCP v4 functions exported for the inet6 API
437 : */
438 :
439 : void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb);
440 : void tcp_v4_mtu_reduced(struct sock *sk);
441 : void tcp_req_err(struct sock *sk, u32 seq, bool abort);
442 : void tcp_ld_RTO_revert(struct sock *sk, u32 seq);
443 : int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb);
444 : struct sock *tcp_create_openreq_child(const struct sock *sk,
445 : struct request_sock *req,
446 : struct sk_buff *skb);
447 : void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst);
448 : struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
449 : struct request_sock *req,
450 : struct dst_entry *dst,
451 : struct request_sock *req_unhash,
452 : bool *own_req);
453 : int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb);
454 : int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len);
455 : int tcp_connect(struct sock *sk);
/* Kind of SYN-ACK requested from tcp_make_synack(). */
enum tcp_synack_type {
	TCP_SYNACK_NORMAL	= 0,
	TCP_SYNACK_FASTOPEN	= 1,
	TCP_SYNACK_COOKIE	= 2,
};
461 : struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
462 : struct request_sock *req,
463 : struct tcp_fastopen_cookie *foc,
464 : enum tcp_synack_type synack_type,
465 : struct sk_buff *syn_skb);
466 : int tcp_disconnect(struct sock *sk, int flags);
467 :
468 : void tcp_finish_connect(struct sock *sk, struct sk_buff *skb);
469 : int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size);
470 : void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb);
471 :
472 : /* From syncookies.c */
473 : struct sock *tcp_get_cookie_sock(struct sock *sk, struct sk_buff *skb,
474 : struct request_sock *req,
475 : struct dst_entry *dst, u32 tsoff);
476 : int __cookie_v4_check(const struct iphdr *iph, const struct tcphdr *th,
477 : u32 cookie);
478 : struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb);
479 : struct request_sock *cookie_tcp_reqsk_alloc(const struct request_sock_ops *ops,
480 : struct sock *sk, struct sk_buff *skb);
481 : #ifdef CONFIG_SYN_COOKIES
482 :
483 : /* Syncookies use a monotonic timer which increments every 60 seconds.
484 : * This counter is used both as a hash input and partially encoded into
485 : * the cookie value. A cookie is only validated further if the delta
486 : * between the current counter value and the encoded one is less than this,
487 : * i.e. a sent cookie is valid only at most for 2*60 seconds (or less if
488 : * the counter advances immediately after a cookie is generated).
489 : */
490 : #define MAX_SYNCOOKIE_AGE 2
491 : #define TCP_SYNCOOKIE_PERIOD (60 * HZ)
492 : #define TCP_SYNCOOKIE_VALID (MAX_SYNCOOKIE_AGE * TCP_SYNCOOKIE_PERIOD)
493 :
494 : /* syncookies: remember time of last synqueue overflow
495 : * But do not dirty this field too often (once per second is enough)
496 : * It is racy as we do not hold a lock, but race is very minor.
497 : */
498 : static inline void tcp_synq_overflow(const struct sock *sk)
499 : {
500 : unsigned int last_overflow;
501 : unsigned int now = jiffies;
502 :
503 : if (sk->sk_reuseport) {
504 : struct sock_reuseport *reuse;
505 :
506 : reuse = rcu_dereference(sk->sk_reuseport_cb);
507 : if (likely(reuse)) {
508 : last_overflow = READ_ONCE(reuse->synq_overflow_ts);
509 : if (!time_between32(now, last_overflow,
510 : last_overflow + HZ))
511 : WRITE_ONCE(reuse->synq_overflow_ts, now);
512 : return;
513 : }
514 : }
515 :
516 : last_overflow = READ_ONCE(tcp_sk(sk)->rx_opt.ts_recent_stamp);
517 : if (!time_between32(now, last_overflow, last_overflow + HZ))
518 : WRITE_ONCE(tcp_sk(sk)->rx_opt.ts_recent_stamp, now);
519 : }
520 :
521 : /* syncookies: no recent synqueue overflow on this listening socket? */
522 : static inline bool tcp_synq_no_recent_overflow(const struct sock *sk)
523 : {
524 : unsigned int last_overflow;
525 : unsigned int now = jiffies;
526 :
527 : if (sk->sk_reuseport) {
528 : struct sock_reuseport *reuse;
529 :
530 : reuse = rcu_dereference(sk->sk_reuseport_cb);
531 : if (likely(reuse)) {
532 : last_overflow = READ_ONCE(reuse->synq_overflow_ts);
533 : return !time_between32(now, last_overflow - HZ,
534 : last_overflow +
535 : TCP_SYNCOOKIE_VALID);
536 : }
537 : }
538 :
539 : last_overflow = READ_ONCE(tcp_sk(sk)->rx_opt.ts_recent_stamp);
540 :
541 : /* If last_overflow <= jiffies <= last_overflow + TCP_SYNCOOKIE_VALID,
542 : * then we're under synflood. However, we have to use
543 : * 'last_overflow - HZ' as lower bound. That's because a concurrent
544 : * tcp_synq_overflow() could update .ts_recent_stamp after we read
545 : * jiffies but before we store .ts_recent_stamp into last_overflow,
546 : * which could lead to rejecting a valid syncookie.
547 : */
548 : return !time_between32(now, last_overflow - HZ,
549 : last_overflow + TCP_SYNCOOKIE_VALID);
550 : }
551 :
552 : static inline u32 tcp_cookie_time(void)
553 : {
554 : u64 val = get_jiffies_64();
555 :
556 : do_div(val, TCP_SYNCOOKIE_PERIOD);
557 : return val;
558 : }
559 :
560 : u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th,
561 : u16 *mssp);
562 : __u32 cookie_v4_init_sequence(const struct sk_buff *skb, __u16 *mss);
563 : u64 cookie_init_timestamp(struct request_sock *req, u64 now);
564 : bool cookie_timestamp_decode(const struct net *net,
565 : struct tcp_options_received *opt);
566 : bool cookie_ecn_ok(const struct tcp_options_received *opt,
567 : const struct net *net, const struct dst_entry *dst);
568 :
569 : /* From net/ipv6/syncookies.c */
570 : int __cookie_v6_check(const struct ipv6hdr *iph, const struct tcphdr *th,
571 : u32 cookie);
572 : struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb);
573 :
574 : u32 __cookie_v6_init_sequence(const struct ipv6hdr *iph,
575 : const struct tcphdr *th, u16 *mssp);
576 : __u32 cookie_v6_init_sequence(const struct sk_buff *skb, __u16 *mss);
577 : #endif
578 : /* tcp_output.c */
579 :
580 : void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss,
581 : int nonagle);
582 : int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs);
583 : int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs);
584 : void tcp_retransmit_timer(struct sock *sk);
585 : void tcp_xmit_retransmit_queue(struct sock *);
586 : void tcp_simple_retransmit(struct sock *);
587 : void tcp_enter_recovery(struct sock *sk, bool ece_ack);
588 : int tcp_trim_head(struct sock *, struct sk_buff *, u32);
/* Queue selector passed to tcp_fragment(). */
enum tcp_queue {
	TCP_FRAG_IN_WRITE_QUEUE	= 0,
	TCP_FRAG_IN_RTX_QUEUE	= 1,
};
593 : int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue,
594 : struct sk_buff *skb, u32 len,
595 : unsigned int mss_now, gfp_t gfp);
596 :
597 : void tcp_send_probe0(struct sock *);
598 : void tcp_send_partial(struct sock *);
599 : int tcp_write_wakeup(struct sock *, int mib);
600 : void tcp_send_fin(struct sock *sk);
601 : void tcp_send_active_reset(struct sock *sk, gfp_t priority);
602 : int tcp_send_synack(struct sock *);
603 : void tcp_push_one(struct sock *, unsigned int mss_now);
604 : void __tcp_send_ack(struct sock *sk, u32 rcv_nxt);
605 : void tcp_send_ack(struct sock *sk);
606 : void tcp_send_delayed_ack(struct sock *sk);
607 : void tcp_send_loss_probe(struct sock *sk);
608 : bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto);
609 : void tcp_skb_collapse_tstamp(struct sk_buff *skb,
610 : const struct sk_buff *next_skb);
611 :
612 : /* tcp_input.c */
613 : void tcp_rearm_rto(struct sock *sk);
614 : void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req);
615 : void tcp_reset(struct sock *sk, struct sk_buff *skb);
616 : void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb);
617 : void tcp_fin(struct sock *sk);
618 :
619 : /* tcp_timer.c */
620 : void tcp_init_xmit_timers(struct sock *);
621 7 : static inline void tcp_clear_xmit_timers(struct sock *sk)
622 : {
623 7 : if (hrtimer_try_to_cancel(&tcp_sk(sk)->pacing_timer) == 1)
624 0 : __sock_put(sk);
625 :
626 7 : if (hrtimer_try_to_cancel(&tcp_sk(sk)->compressed_ack_timer) == 1)
627 0 : __sock_put(sk);
628 :
629 7 : inet_csk_clear_xmit_timers(sk);
630 7 : }
631 :
632 : unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu);
633 : unsigned int tcp_current_mss(struct sock *sk);
634 : u32 tcp_clamp_probe0_to_user_timeout(const struct sock *sk, u32 when);
635 :
636 : /* Bound MSS / TSO packet size with the half of the window */
637 415 : static inline int tcp_bound_to_half_wnd(struct tcp_sock *tp, int pktsize)
638 : {
639 415 : int cutoff;
640 :
641 : /* When peer uses tiny windows, there is no use in packetizing
642 : * to sub-MSS pieces for the sake of SWS or making sure there
643 : * are enough packets in the pipe for fast recovery.
644 : *
645 : * On the other hand, for extremely large MSS devices, handling
646 : * smaller than MSS windows in this way does make sense.
647 : */
648 415 : if (tp->max_window > TCP_MSS_DEFAULT)
649 415 : cutoff = (tp->max_window >> 1);
650 : else
651 0 : cutoff = tp->max_window;
652 :
653 415 : if (cutoff && pktsize > cutoff)
654 411 : return max_t(int, cutoff, 68U - tp->tcp_header_len);
655 : else
656 : return pktsize;
657 : }
658 :
659 : /* tcp.c */
660 : void tcp_get_info(struct sock *, struct tcp_info *);
661 :
662 : /* Read 'sendfile()'-style from a TCP socket */
663 : int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
664 : sk_read_actor_t recv_actor);
665 :
666 : void tcp_initialize_rcv_mss(struct sock *sk);
667 :
668 : int tcp_mtu_to_mss(struct sock *sk, int pmtu);
669 : int tcp_mss_to_mtu(struct sock *sk, int mss);
670 : void tcp_mtup_init(struct sock *sk);
671 :
672 355 : static inline void tcp_bound_rto(const struct sock *sk)
673 : {
674 355 : if (inet_csk(sk)->icsk_rto > TCP_RTO_MAX)
675 0 : inet_csk(sk)->icsk_rto = TCP_RTO_MAX;
676 : }
677 :
678 355 : static inline u32 __tcp_set_rto(const struct tcp_sock *tp)
679 : {
680 355 : return usecs_to_jiffies((tp->srtt_us >> 3) + tp->rttvar_us);
681 : }
682 :
683 19 : static inline void __tcp_fast_path_on(struct tcp_sock *tp, u32 snd_wnd)
684 : {
685 19 : tp->pred_flags = htonl((tp->tcp_header_len << 26) |
686 : ntohl(TCP_FLAG_ACK) |
687 : snd_wnd);
688 0 : }
689 :
690 19 : static inline void tcp_fast_path_on(struct tcp_sock *tp)
691 : {
692 19 : __tcp_fast_path_on(tp, tp->snd_wnd >> tp->rx_opt.snd_wscale);
693 4 : }
694 :
695 15 : static inline void tcp_fast_path_check(struct sock *sk)
696 : {
697 15 : struct tcp_sock *tp = tcp_sk(sk);
698 :
699 15 : if (RB_EMPTY_ROOT(&tp->out_of_order_queue) &&
700 15 : tp->rcv_wnd &&
701 15 : atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf &&
702 15 : !tp->urg_data)
703 15 : tcp_fast_path_on(tp);
704 15 : }
705 :
706 : /* Compute the actual rto_min value */
707 259 : static inline u32 tcp_rto_min(struct sock *sk)
708 : {
709 259 : const struct dst_entry *dst = __sk_dst_get(sk);
710 259 : u32 rto_min = inet_csk(sk)->icsk_rto_min;
711 :
712 518 : if (dst && dst_metric_locked(dst, RTAX_RTO_MIN))
713 0 : rto_min = dst_metric_rtt(dst, RTAX_RTO_MIN);
714 259 : return rto_min;
715 : }
716 :
717 258 : static inline u32 tcp_rto_min_us(struct sock *sk)
718 : {
719 258 : return jiffies_to_usecs(tcp_rto_min(sk));
720 : }
721 :
722 0 : static inline bool tcp_ca_dst_locked(const struct dst_entry *dst)
723 : {
724 0 : return dst_metric_locked(dst, RTAX_CC_ALGO);
725 : }
726 :
727 : /* Minimum RTT in usec. ~0 means not available. */
728 450 : static inline u32 tcp_min_rtt(const struct tcp_sock *tp)
729 : {
730 450 : return minmax_get(&tp->rtt_min);
731 : }
732 :
733 : /* Compute the actual receive window we are currently advertising.
734 : * Rcv_nxt can be after the window if our peer push more data
735 : * than the offered window.
736 : */
737 647 : static inline u32 tcp_receive_window(const struct tcp_sock *tp)
738 : {
739 647 : s32 win = tp->rcv_wup + tp->rcv_wnd - tp->rcv_nxt;
740 :
741 647 : if (win < 0)
742 : win = 0;
743 647 : return (u32) win;
744 : }
745 :
746 : /* Choose a new window, without checks for shrinking, and without
747 : * scaling applied to the result. The caller does these things
748 : * if necessary. This is a "raw" window selection.
749 : */
750 : u32 __tcp_select_window(struct sock *sk);
751 :
752 : void tcp_send_window_probe(struct sock *sk);
753 :
754 : /* TCP uses 32bit jiffies to save some space.
755 : * Note that this is different from tcp_time_stamp, which
756 : * historically has been the same until linux-4.13.
757 : */
758 : #define tcp_jiffies32 ((u32)jiffies)
759 :
760 : /*
761 : * Deliver a 32bit value for TCP timestamp option (RFC 7323)
762 : * It is no longer tied to jiffies, but to 1 ms clock.
763 : * Note: double check if you want to use tcp_jiffies32 instead of this.
764 : */
765 : #define TCP_TS_HZ 1000
766 :
767 1070 : static inline u64 tcp_clock_ns(void)
768 : {
769 1070 : return ktime_get_ns();
770 : }
771 :
772 4 : static inline u64 tcp_clock_us(void)
773 : {
774 4 : return div_u64(tcp_clock_ns(), NSEC_PER_USEC);
775 : }
776 :
777 : /* This should only be used in contexts where tp->tcp_mstamp is up to date */
778 0 : static inline u32 tcp_time_stamp(const struct tcp_sock *tp)
779 : {
780 0 : return div_u64(tp->tcp_mstamp, USEC_PER_SEC / TCP_TS_HZ);
781 : }
782 :
783 : /* Convert a nsec timestamp into TCP TSval timestamp (ms based currently) */
784 0 : static inline u32 tcp_ns_to_ts(u64 ns)
785 : {
786 0 : return div_u64(ns, NSEC_PER_SEC / TCP_TS_HZ);
787 : }
788 :
789 : /* Could use tcp_clock_us() / 1000, but this version uses a single divide */
790 0 : static inline u32 tcp_time_stamp_raw(void)
791 : {
792 0 : return tcp_ns_to_ts(tcp_clock_ns());
793 : }
794 :
795 : void tcp_mstamp_refresh(struct tcp_sock *tp);
796 :
797 1276 : static inline u32 tcp_stamp_us_delta(u64 t1, u64 t0)
798 : {
799 1276 : return max_t(s64, t1 - t0, 0);
800 : }
801 :
802 0 : static inline u32 tcp_skb_timestamp(const struct sk_buff *skb)
803 : {
804 0 : return tcp_ns_to_ts(skb->skb_mstamp_ns);
805 : }
806 :
807 : /* provide the departure time in us unit */
808 954 : static inline u64 tcp_skb_timestamp_us(const struct sk_buff *skb)
809 : {
810 954 : return div_u64(skb->skb_mstamp_ns, NSEC_PER_USEC);
811 : }
812 :
813 :
/*
 * Access byte 13 of the TCP header @th, which holds the TCPHDR_* flag bits.
 * The argument is parenthesized so that an expression such as 'p + 1' is
 * evaluated in its own pointer type before the cast to a byte pointer.
 */
#define tcp_flag_byte(th) (((u_int8_t *)(th))[13])
815 :
816 : #define TCPHDR_FIN 0x01
817 : #define TCPHDR_SYN 0x02
818 : #define TCPHDR_RST 0x04
819 : #define TCPHDR_PSH 0x08
820 : #define TCPHDR_ACK 0x10
821 : #define TCPHDR_URG 0x20
822 : #define TCPHDR_ECE 0x40
823 : #define TCPHDR_CWR 0x80
824 :
825 : #define TCPHDR_SYN_ECN (TCPHDR_SYN | TCPHDR_ECE | TCPHDR_CWR)
826 :
827 : /* This is what the send packet queuing engine uses to pass
828 : * TCP per-packet control information to the transmission code.
829 : * We also store the host-order sequence numbers in here too.
830 : * This is 44 bytes if IPV6 is enabled. It is overlaid on skb->cb[] through TCP_SKB_CB().
831 : * If this grows please adjust skbuff.h:skbuff->cb[xxx] size appropriately.
832 : */
833 : struct tcp_skb_cb {
834 : __u32 seq; /* Starting sequence number */
835 : __u32 end_seq; /* SEQ + FIN + SYN + datalen */
836 : union {
837 : /* Note : tcp_tw_isn is used in input path only
838 : * (isn chosen by tcp_timewait_state_process())
839 : *
840 : * tcp_gso_segs/size are used in write queue only,
841 : * cf tcp_skb_pcount()/tcp_skb_mss()
842 : */
843 : __u32 tcp_tw_isn;
844 : struct {
845 : u16 tcp_gso_segs; /* returned by tcp_skb_pcount() */
846 : u16 tcp_gso_size; /* returned by tcp_skb_mss() */
847 : };
848 : };
849 : __u8 tcp_flags; /* TCP header flags. (tcp[13]) */
850 :
851 : __u8 sacked; /* State flags for SACK, built from the TCPCB_* bits below. */
852 : #define TCPCB_SACKED_ACKED 0x01 /* SKB ACK'd by a SACK block */
853 : #define TCPCB_SACKED_RETRANS 0x02 /* SKB retransmitted */
854 : #define TCPCB_LOST 0x04 /* SKB is lost */
855 : #define TCPCB_TAGBITS 0x07 /* All tag bits (the three flags above) */
856 : #define TCPCB_REPAIRED 0x10 /* SKB repaired (no skb_mstamp_ns) */
857 : #define TCPCB_EVER_RETRANS 0x80 /* Ever retransmitted frame */
858 : #define TCPCB_RETRANS (TCPCB_SACKED_RETRANS|TCPCB_EVER_RETRANS| \
859 : TCPCB_REPAIRED)
860 :
861 : __u8 ip_dsfield; /* IPv4 tos or IPv6 dsfield */
862 : __u8 txstamp_ack:1, /* Record TX timestamp for ack? */
863 : eor:1, /* Is skb MSG_EOR marked? Tested by tcp_skb_can_collapse_to() */
864 : has_rxtstamp:1, /* SKB has a RX timestamp */
865 : unused:5; /* remaining bits of this byte */
866 : __u32 ack_seq; /* Sequence number ACK'd */
867 : union {
868 : struct {
869 : /* There is space for up to 24 bytes */
870 : __u32 in_flight:30,/* Bytes in flight at transmit */
871 : is_app_limited:1, /* cwnd not fully used? */
872 : unused:1;
873 : /* pkts S/ACKed so far upon tx of skb, incl retrans: */
874 : __u32 delivered;
875 : /* start of send pipeline phase */
876 : u64 first_tx_mstamp;
877 : /* when we reached the "delivered" count */
878 : u64 delivered_mstamp;
879 : } tx; /* only used for outgoing skbs */
880 : union {
881 : struct inet_skb_parm h4;
882 : #if IS_ENABLED(CONFIG_IPV6)
883 : struct inet6_skb_parm h6;
884 : #endif
885 : } header; /* For incoming skbs */
886 : struct {
887 : __u32 flags; /* tested against BPF_F_INGRESS by tcp_skb_bpf_ingress() */
888 : struct sock *sk_redir; /* read by tcp_skb_bpf_redirect_fetch(), reset by tcp_skb_bpf_redirect_clear() */
889 : void *data_end; /* skb->data + skb_headlen(skb), set by bpf_compute_data_end_sk_skb() */
890 : } bpf;
891 : };
892 : };
893 :
/* View the start of an skb's control buffer (cb[]) as a struct tcp_skb_cb. */
#define TCP_SKB_CB(__skb) ((struct tcp_skb_cb *)((__skb)->cb))
895 :
896 0 : static inline void bpf_compute_data_end_sk_skb(struct sk_buff *skb)
897 : {
898 0 : TCP_SKB_CB(skb)->bpf.data_end = skb->data + skb_headlen(skb);
899 : }
900 :
901 : static inline bool tcp_skb_bpf_ingress(const struct sk_buff *skb)
902 : {
903 : return TCP_SKB_CB(skb)->bpf.flags & BPF_F_INGRESS;
904 : }
905 :
906 : static inline struct sock *tcp_skb_bpf_redirect_fetch(struct sk_buff *skb)
907 : {
908 : return TCP_SKB_CB(skb)->bpf.sk_redir;
909 : }
910 :
911 : static inline void tcp_skb_bpf_redirect_clear(struct sk_buff *skb)
912 : {
913 : TCP_SKB_CB(skb)->bpf.sk_redir = NULL;
914 : }
915 :
916 : extern const struct inet_connection_sock_af_ops ipv4_specific;
917 :
918 : #if IS_ENABLED(CONFIG_IPV6)
919 : /* This is the variant of inet6_iif() that must be used by TCP,
920 : * as TCP moves IP6CB into a different location in skb->cb[]
921 : */
922 : static inline int tcp_v6_iif(const struct sk_buff *skb)
923 : {
924 : return TCP_SKB_CB(skb)->header.h6.iif;
925 : }
926 :
927 : static inline int tcp_v6_iif_l3_slave(const struct sk_buff *skb)
928 : {
929 : bool l3_slave = ipv6_l3mdev_skb(TCP_SKB_CB(skb)->header.h6.flags);
930 :
931 : return l3_slave ? skb->skb_iif : TCP_SKB_CB(skb)->header.h6.iif;
932 : }
933 :
934 : /* TCP_SKB_CB reference means this can not be used from early demux */
935 : static inline int tcp_v6_sdif(const struct sk_buff *skb)
936 : {
937 : #if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV)
938 : if (skb && ipv6_l3mdev_skb(TCP_SKB_CB(skb)->header.h6.flags))
939 : return TCP_SKB_CB(skb)->header.h6.iif;
940 : #endif
941 : return 0;
942 : }
943 :
944 : extern const struct inet_connection_sock_af_ops ipv6_specific;
945 :
946 : INDIRECT_CALLABLE_DECLARE(void tcp_v6_send_check(struct sock *sk, struct sk_buff *skb));
947 : INDIRECT_CALLABLE_DECLARE(int tcp_v6_rcv(struct sk_buff *skb));
948 : INDIRECT_CALLABLE_DECLARE(void tcp_v6_early_demux(struct sk_buff *skb));
949 :
950 : #endif
951 :
952 : /* TCP_SKB_CB reference means this can not be used from early demux */
953 0 : static inline int tcp_v4_sdif(struct sk_buff *skb)
954 : {
955 : #if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV)
956 : if (skb && ipv4_l3mdev_skb(TCP_SKB_CB(skb)->header.h4.flags))
957 : return TCP_SKB_CB(skb)->header.h4.iif;
958 : #endif
959 0 : return 0;
960 : }
961 :
962 : /* Due to TSO, an SKB can be composed of multiple actual
963 : * packets. To keep these tracked properly, we use this.
964 : */
965 5584 : static inline int tcp_skb_pcount(const struct sk_buff *skb)
966 : {
967 5158 : return TCP_SKB_CB(skb)->tcp_gso_segs;
968 : }
969 :
970 837 : static inline void tcp_skb_pcount_set(struct sk_buff *skb, int segs)
971 : {
972 837 : TCP_SKB_CB(skb)->tcp_gso_segs = segs;
973 : }
974 :
975 0 : static inline void tcp_skb_pcount_add(struct sk_buff *skb, int segs)
976 : {
977 0 : TCP_SKB_CB(skb)->tcp_gso_segs += segs;
978 : }
979 :
980 : /* This is valid iff skb is in write queue and tcp_skb_pcount() > 1. */
981 426 : static inline int tcp_skb_mss(const struct sk_buff *skb)
982 : {
983 426 : return TCP_SKB_CB(skb)->tcp_gso_size;
984 : }
985 :
986 50 : static inline bool tcp_skb_can_collapse_to(const struct sk_buff *skb)
987 : {
988 50 : return likely(!TCP_SKB_CB(skb)->eor);
989 : }
990 :
991 0 : static inline bool tcp_skb_can_collapse(const struct sk_buff *to,
992 : const struct sk_buff *from)
993 : {
994 0 : return likely(tcp_skb_can_collapse_to(to) &&
995 : mptcp_skb_can_collapse(to, from));
996 : }
997 :
998 : /* Events passed to the congestion control cwnd_event() hook, see tcp_ca_event() */
999 : enum tcp_ca_event {
1000 : CA_EVENT_TX_START, /* first transmit when no packets in flight */
1001 : CA_EVENT_CWND_RESTART, /* congestion window restart */
1002 : CA_EVENT_COMPLETE_CWR, /* end of congestion recovery */
1003 : CA_EVENT_LOSS, /* loss timeout */
1004 : CA_EVENT_ECN_NO_CE, /* ECT set, but not CE marked */
1005 : CA_EVENT_ECN_IS_CE, /* received CE marked IP packet */
1006 : };
1007 :
1008 : /* Bit flags describing an inbound ACK, passed to cong_ops->in_ack_event() */
1009 : enum tcp_ca_ack_event_flags {
1010 : CA_ACK_SLOWPATH = (1 << 0), /* In slow path processing */
1011 : CA_ACK_WIN_UPDATE = (1 << 1), /* ACK updated window */
1012 : CA_ACK_ECE = (1 << 2), /* ECE bit is set on ack */
1013 : };
1014 :
1015 : /*
1016 : * Interface for adding new TCP congestion control handlers
1017 : */
1018 : #define TCP_CA_NAME_MAX 16 /* size of tcp_congestion_ops.name[] */
1019 : #define TCP_CA_MAX 128
1020 : #define TCP_CA_BUF_MAX (TCP_CA_NAME_MAX*TCP_CA_MAX) /* TCP_CA_MAX names of TCP_CA_NAME_MAX bytes each */
1021 :
1022 : #define TCP_CA_UNSPEC 0
1023 :
1024 : /* Algorithm can be set on socket without CAP_NET_ADMIN privileges */
1025 : #define TCP_CONG_NON_RESTRICTED 0x1
1026 : /* Requires ECN/ECT set on all packets */
1027 : #define TCP_CONG_NEEDS_ECN 0x2
1028 : #define TCP_CONG_MASK (TCP_CONG_NON_RESTRICTED | TCP_CONG_NEEDS_ECN) /* both TCP_CONG_* flags above */
1029 :
1030 : union tcp_cc_info;
1031 :
1032 : struct ack_sample { /* argument type of the pkts_acked() congestion control hook */
1033 : u32 pkts_acked; /* presumably packets acked by this ACK - confirm */
1034 : s32 rtt_us; /* presumably an RTT in usec; signed, so may encode "none" - confirm */
1035 : u32 in_flight; /* presumably packets in flight - confirm */
1036 : };
1037 :
1038 : /* A rate sample measures the number of (original/retransmitted) data
1039 : * packets delivered "delivered" over an interval of time "interval_us".
1040 : * The tcp_rate.c code fills in the rate sample, and congestion
1041 : * control modules that define a cong_control function to run at the end
1042 : * of ACK processing can optionally choose to consult this sample when
1043 : * setting cwnd and pacing rate.
1044 : * A sample is invalid if "delivered" or "interval_us" is negative.
1045 : */
1046 : struct rate_sample {
1047 : u64 prior_mstamp; /* starting timestamp for interval */
1048 : u32 prior_delivered; /* tp->delivered at "prior_mstamp" */
1049 : s32 delivered; /* number of packets delivered over interval */
1050 : long interval_us; /* time for tp->delivered to incr "delivered" */
1051 : u32 snd_interval_us; /* snd interval for delivered packets */
1052 : u32 rcv_interval_us; /* rcv interval for delivered packets */
1053 : long rtt_us; /* RTT of last (S)ACKed packet (or -1) */
1054 : int losses; /* number of packets marked lost upon ACK */
1055 : u32 acked_sacked; /* number of packets newly (S)ACKed upon ACK */
1056 : u32 prior_in_flight; /* in flight before this ACK */
1057 : bool is_app_limited; /* is sample from packet with bubble in pipe? */
1058 : bool is_retrans; /* is sample from retransmission? */
1059 : bool is_ack_delayed; /* is this (likely) a delayed ACK? */
1060 : };
1061 :
1062 : struct tcp_congestion_ops { /* hook table describing one congestion control algorithm */
1063 : struct list_head list; /* list linkage; presumably the list of registered algorithms - confirm */
1064 : u32 key; /* numeric identifier, cf. tcp_ca_find_key() */
1065 : u32 flags; /* TCP_CONG_* bits, e.g. TCP_CONG_NEEDS_ECN tested by tcp_ca_needs_ecn() */
1066 :
1067 : /* initialize private data (optional) */
1068 : void (*init)(struct sock *sk);
1069 : /* cleanup private data (optional) */
1070 : void (*release)(struct sock *sk);
1071 :
1072 : /* return slow start threshold (required) */
1073 : u32 (*ssthresh)(struct sock *sk);
1074 : /* do new cwnd calculation (required) */
1075 : void (*cong_avoid)(struct sock *sk, u32 ack, u32 acked);
1076 : /* call before changing ca_state (optional), invoked from tcp_set_ca_state() */
1077 : void (*set_state)(struct sock *sk, u8 new_state);
1078 : /* call when cwnd event occurs (optional), invoked from tcp_ca_event() */
1079 : void (*cwnd_event)(struct sock *sk, enum tcp_ca_event ev);
1080 : /* call when ack arrives (optional) */
1081 : void (*in_ack_event)(struct sock *sk, u32 flags);
1082 : /* new value of cwnd after loss (required) */
1083 : u32 (*undo_cwnd)(struct sock *sk);
1084 : /* hook for packet ack accounting (optional) */
1085 : void (*pkts_acked)(struct sock *sk, const struct ack_sample *sample);
1086 : /* override sysctl_tcp_min_tso_segs */
1087 : u32 (*min_tso_segs)(struct sock *sk);
1088 : /* returns the multiplier used in tcp_sndbuf_expand (optional) */
1089 : u32 (*sndbuf_expand)(struct sock *sk);
1090 : /* call when packets are delivered to update cwnd and pacing rate,
1091 : * after all the ca_state processing. (optional)
1092 : */
1093 : void (*cong_control)(struct sock *sk, const struct rate_sample *rs);
1094 : /* get info for inet_diag (optional) */
1095 : size_t (*get_info)(struct sock *sk, u32 ext, int *attr,
1096 : union tcp_cc_info *info);
1097 :
1098 : char name[TCP_CA_NAME_MAX]; /* algorithm name, cf. tcp_ca_find() */
1099 : struct module *owner; /* presumably the module providing the algorithm - confirm */
1100 : };
1101 :
1102 : int tcp_register_congestion_control(struct tcp_congestion_ops *type);
1103 : void tcp_unregister_congestion_control(struct tcp_congestion_ops *type);
1104 :
1105 : void tcp_assign_congestion_control(struct sock *sk);
1106 : void tcp_init_congestion_control(struct sock *sk);
1107 : void tcp_cleanup_congestion_control(struct sock *sk);
1108 : int tcp_set_default_congestion_control(struct net *net, const char *name);
1109 : void tcp_get_default_congestion_control(struct net *net, char *name);
1110 : void tcp_get_available_congestion_control(char *buf, size_t len);
1111 : void tcp_get_allowed_congestion_control(char *buf, size_t len);
1112 : int tcp_set_allowed_congestion_control(char *allowed);
1113 : int tcp_set_congestion_control(struct sock *sk, const char *name, bool load,
1114 : bool cap_net_admin);
1115 : u32 tcp_slow_start(struct tcp_sock *tp, u32 acked);
1116 : void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked);
1117 :
1118 : u32 tcp_reno_ssthresh(struct sock *sk);
1119 : u32 tcp_reno_undo_cwnd(struct sock *sk);
1120 : void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked);
1121 : extern struct tcp_congestion_ops tcp_reno;
1122 :
1123 : struct tcp_congestion_ops *tcp_ca_find(const char *name);
1124 : struct tcp_congestion_ops *tcp_ca_find_key(u32 key);
1125 : u32 tcp_ca_get_key_by_name(struct net *net, const char *name, bool *ecn_ca);
1126 : #ifdef CONFIG_INET
1127 : char *tcp_ca_get_name_by_key(u32 key, char *buffer);
1128 : #else
1129 : static inline char *tcp_ca_get_name_by_key(u32 key, char *buffer)
1130 : {
1131 : return NULL;
1132 : }
1133 : #endif
1134 :
1135 4 : static inline bool tcp_ca_needs_ecn(const struct sock *sk)
1136 : {
1137 4 : const struct inet_connection_sock *icsk = inet_csk(sk);
1138 :
1139 4 : return icsk->icsk_ca_ops->flags & TCP_CONG_NEEDS_ECN;
1140 : }
1141 :
1142 4 : static inline void tcp_set_ca_state(struct sock *sk, const u8 ca_state)
1143 : {
1144 4 : struct inet_connection_sock *icsk = inet_csk(sk);
1145 :
1146 4 : if (icsk->icsk_ca_ops->set_state)
1147 4 : icsk->icsk_ca_ops->set_state(sk, ca_state);
1148 4 : icsk->icsk_ca_state = ca_state;
1149 0 : }
1150 :
1151 235 : static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event)
1152 : {
1153 235 : const struct inet_connection_sock *icsk = inet_csk(sk);
1154 :
1155 235 : if (icsk->icsk_ca_ops->cwnd_event)
1156 235 : icsk->icsk_ca_ops->cwnd_event(sk, event);
1157 : }
1158 :
1159 : /* From tcp_rate.c */
1160 : void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb);
1161 : void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb,
1162 : struct rate_sample *rs);
1163 : void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost,
1164 : bool is_sack_reneg, struct rate_sample *rs);
1165 : void tcp_rate_check_app_limited(struct sock *sk);
1166 :
1167 : /* These functions determine how the current flow behaves in respect of SACK
1168 : * handling. SACK is negotiated with the peer, and therefore it can vary
1169 : * between different flows.
1170 : *
1171 : * tcp_is_sack - SACK enabled
1172 : * tcp_is_reno - No SACK
1173 : */
1174 1803 : static inline int tcp_is_sack(const struct tcp_sock *tp)
1175 : {
1176 1452 : return likely(tp->rx_opt.sack_ok);
1177 : }
1178 :
1179 351 : static inline bool tcp_is_reno(const struct tcp_sock *tp)
1180 : {
1181 351 : return !tcp_is_sack(tp);
1182 : }
1183 :
1184 2409 : static inline unsigned int tcp_left_out(const struct tcp_sock *tp)
1185 : {
1186 893 : return tp->sacked_out + tp->lost_out;
1187 : }
1188 :
1189 : /* This determines how many packets are "in the network" to the best
1190 : * of our knowledge. In many cases it is conservative, but where
1191 : * detailed information is available from the receiver (via SACK
1192 : * blocks etc.) we can make more aggressive calculations.
1193 : *
1194 : * Use this for decisions involving congestion control, use just
1195 : * tp->packets_out to determine if the send queue is empty or not.
1196 : *
1197 : * Read this equation as:
1198 : *
1199 : * "Packets sent once on transmission queue" MINUS
1200 : * "Packets left network, but not honestly ACKed yet" PLUS
1201 : * "Packets fast retransmitted"
1202 : */
1203 2058 : static inline unsigned int tcp_packets_in_flight(const struct tcp_sock *tp)
1204 : {
1205 2058 : return tp->packets_out - tcp_left_out(tp) + tp->retrans_out;
1206 : }
1207 :
1208 : #define TCP_INFINITE_SSTHRESH 0x7fffffff
1209 :
1210 1066 : static inline bool tcp_in_slow_start(const struct tcp_sock *tp)
1211 : {
1212 715 : return tp->snd_cwnd < tp->snd_ssthresh;
1213 : }
1214 :
1215 3 : static inline bool tcp_in_initial_slowstart(const struct tcp_sock *tp)
1216 : {
1217 3 : return tp->snd_ssthresh >= TCP_INFINITE_SSTHRESH;
1218 : }
1219 :
1220 719 : static inline bool tcp_in_cwnd_reduction(const struct sock *sk)
1221 : {
1222 715 : return (TCPF_CA_CWR | TCPF_CA_Recovery) &
1223 719 : (1 << inet_csk(sk)->icsk_ca_state);
1224 : }
1225 :
1226 : /* If cwnd > ssthresh, we may raise ssthresh to be half-way to cwnd.
1227 : * The exception is cwnd reduction phase, when cwnd is decreasing towards
1228 : * ssthresh.
1229 : */
1230 4 : static inline __u32 tcp_current_ssthresh(const struct sock *sk)
1231 : {
1232 4 : const struct tcp_sock *tp = tcp_sk(sk);
1233 :
1234 4 : if (tcp_in_cwnd_reduction(sk))
1235 0 : return tp->snd_ssthresh;
1236 : else
1237 4 : return max(tp->snd_ssthresh,
1238 : ((tp->snd_cwnd >> 1) +
1239 : (tp->snd_cwnd >> 2)));
1240 : }
1241 :
1242 : /* Use define here intentionally to get WARN_ON location shown at the caller */
1243 : #define tcp_verify_left_out(tp) WARN_ON(tcp_left_out(tp) > tp->packets_out)
1244 :
1245 : void tcp_enter_cwr(struct sock *sk);
1246 : __u32 tcp_init_cwnd(const struct tcp_sock *tp, const struct dst_entry *dst);
1247 :
1248 : /* The maximum number of MSS of available cwnd for which TSO defers
1249 : * sending if not using sysctl_tcp_tso_win_divisor.
1250 : */
1251 0 : static inline __u32 tcp_max_tso_deferred_mss(const struct tcp_sock *tp)
1252 : {
1253 0 : return 3;
1254 : }
1255 :
1256 : /* Returns end sequence number of the receiver's advertised window */
1257 428 : static inline u32 tcp_wnd_end(const struct tcp_sock *tp)
1258 : {
1259 428 : return tp->snd_una + tp->snd_wnd;
1260 : }
1261 :
1262 : /* We follow the spirit of RFC2861 to validate cwnd but implement a more
1263 : * flexible approach. The RFC suggests cwnd should not be raised unless
1264 : * it was fully used previously. And that's exactly what we do in
1265 : * congestion avoidance mode. But in slow start we allow cwnd to grow
1266 : * as long as the application has used half the cwnd.
1267 : * Example :
1268 : * cwnd is 10 (IW10), but application sends 9 frames.
1269 : * We allow cwnd to reach 18 when all frames are ACKed.
1270 : * This check is safe because it's as aggressive as slow start which already
1271 : * risks 100% overshoot. The advantage is that we discourage application to
1272 : * either send more filler packets or data to artificially blow up the cwnd
1273 : * usage, and allow application-limited process to probe bw more aggressively.
1274 : */
1275 715 : static inline bool tcp_is_cwnd_limited(const struct sock *sk)
1276 : {
1277 715 : const struct tcp_sock *tp = tcp_sk(sk);
1278 :
1279 : /* If in slow start, ensure cwnd grows to twice what was ACKed. */
1280 715 : if (tcp_in_slow_start(tp))
1281 715 : return tp->snd_cwnd < 2 * tp->max_packets_out;
1282 :
1283 0 : return tp->is_cwnd_limited;
1284 : }
1285 :
1286 : /* BBR congestion control needs pacing.
1287 : * Same remark for SO_MAX_PACING_RATE.
1288 : * sch_fq packet scheduler is efficiently handling pacing,
1289 : * but is not always installed/used.
1290 : * Return true if TCP stack should pace packets itself.
1291 : */
1292 366 : static inline bool tcp_needs_internal_pacing(const struct sock *sk)
1293 : {
1294 366 : return smp_load_acquire(&sk->sk_pacing_status) == SK_PACING_NEEDED;
1295 : }
1296 :
1297 : /* Estimates in how many jiffies next packet for this flow can be sent.
1298 : * Scheduling a retransmit timer too early would be silly.
1299 : */
1300 351 : static inline unsigned long tcp_pacing_delay(const struct sock *sk)
1301 : {
1302 351 : s64 delay = tcp_sk(sk)->tcp_wstamp_ns - tcp_sk(sk)->tcp_clock_cache;
1303 :
1304 351 : return delay > 0 ? nsecs_to_jiffies(delay) : 0;
1305 : }
1306 :
/* Wrapper around inet_csk_reset_xmit_timer() that pushes the requested
 * delay out by the current pacing delay.
 */
static inline void tcp_reset_xmit_timer(struct sock *sk,
					const int what,
					unsigned long when,
					const unsigned long max_when)
{
	const unsigned long paced_when = when + tcp_pacing_delay(sk);

	inet_csk_reset_xmit_timer(sk, what, paced_when, max_when);
}
1315 :
1316 : /* Something is really bad, we could not queue an additional packet,
1317 : * because qdisc is full or receiver sent a 0 window, or we are paced.
1318 : * We do not want to add fuel to the fire, or abort too early,
1319 : * so make sure the timer we arm now is at least 200ms in the future,
1320 : * regardless of current icsk_rto value (as it could be ~2ms)
1321 : */
1322 0 : static inline unsigned long tcp_probe0_base(const struct sock *sk)
1323 : {
1324 0 : return max_t(unsigned long, inet_csk(sk)->icsk_rto, TCP_RTO_MIN);
1325 : }
1326 :
1327 : /* Variant of inet_csk_rto_backoff() used for zero window probes */
1328 0 : static inline unsigned long tcp_probe0_when(const struct sock *sk,
1329 : unsigned long max_when)
1330 : {
1331 0 : u8 backoff = min_t(u8, ilog2(TCP_RTO_MAX / TCP_RTO_MIN) + 1,
1332 : inet_csk(sk)->icsk_backoff);
1333 0 : u64 when = (u64)tcp_probe0_base(sk) << backoff;
1334 :
1335 0 : return (unsigned long)min_t(u64, when, max_when);
1336 : }
1337 :
1338 0 : static inline void tcp_check_probe_timer(struct sock *sk)
1339 : {
1340 0 : if (!tcp_sk(sk)->packets_out && !inet_csk(sk)->icsk_pending)
1341 0 : tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
1342 : tcp_probe0_base(sk), TCP_RTO_MAX);
1343 0 : }
1344 :
1345 8 : static inline void tcp_init_wl(struct tcp_sock *tp, u32 seq)
1346 : {
1347 8 : tp->snd_wl1 = seq;
1348 : }
1349 :
1350 414 : static inline void tcp_update_wl(struct tcp_sock *tp, u32 seq)
1351 : {
1352 365 : tp->snd_wl1 = seq;
1353 49 : }
1354 :
1355 : /*
1356 : * Calculate(/check) TCP checksum
1357 : */
1358 445 : static inline __sum16 tcp_v4_check(int len, __be32 saddr,
1359 : __be32 daddr, __wsum base)
1360 : {
1361 445 : return csum_tcpudp_magic(saddr, daddr, len, IPPROTO_TCP, base);
1362 : }
1363 :
1364 238 : static inline bool tcp_checksum_complete(struct sk_buff *skb)
1365 : {
1366 238 : return !skb_csum_unnecessary(skb) &&
1367 0 : __skb_checksum_complete(skb);
1368 : }
1369 :
1370 : bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb);
1371 : int tcp_filter(struct sock *sk, struct sk_buff *skb);
1372 : void tcp_set_state(struct sock *sk, int state);
1373 : void tcp_done(struct sock *sk);
1374 : int tcp_abort(struct sock *sk, int err);
1375 :
1376 0 : static inline void tcp_sack_reset(struct tcp_options_received *rx_opt)
1377 : {
1378 0 : rx_opt->dsack = 0;
1379 0 : rx_opt->num_sacks = 0;
1380 0 : }
1381 :
1382 : void tcp_cwnd_restart(struct sock *sk, s32 delta);
1383 :
1384 361 : static inline void tcp_slow_start_after_idle_check(struct sock *sk)
1385 : {
1386 361 : const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
1387 361 : struct tcp_sock *tp = tcp_sk(sk);
1388 361 : s32 delta;
1389 :
1390 361 : if (!sock_net(sk)->ipv4.sysctl_tcp_slow_start_after_idle || tp->packets_out ||
1391 217 : ca_ops->cong_control)
1392 : return;
1393 217 : delta = tcp_jiffies32 - tp->lsndtime;
1394 217 : if (delta > inet_csk(sk)->icsk_rto)
1395 4 : tcp_cwnd_restart(sk, delta);
1396 : }
1397 :
1398 : /* Determine a window scaling and initial window to offer. */
1399 : void tcp_select_initial_window(const struct sock *sk, int __space,
1400 : __u32 mss, __u32 *rcv_wnd,
1401 : __u32 *window_clamp, int wscale_ok,
1402 : __u8 *rcv_wscale, __u32 init_rcv_wnd);
1403 :
1404 988 : static inline int tcp_win_from_space(const struct sock *sk, int space)
1405 : {
1406 951 : int tcp_adv_win_scale = sock_net(sk)->ipv4.sysctl_tcp_adv_win_scale;
1407 :
1408 988 : return tcp_adv_win_scale <= 0 ?
1409 988 : (space>>(-tcp_adv_win_scale)) :
1410 988 : space - (space>>tcp_adv_win_scale);
1411 : }
1412 :
1413 : /* Note: caller must be prepared to deal with negative returns */
1414 497 : static inline int tcp_space(const struct sock *sk)
1415 : {
1416 1491 : return tcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf) -
1417 497 : READ_ONCE(sk->sk_backlog.len) -
1418 497 : atomic_read(&sk->sk_rmem_alloc));
1419 : }
1420 :
1421 472 : static inline int tcp_full_space(const struct sock *sk)
1422 : {
1423 944 : return tcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf));
1424 : }
1425 :
1426 : void tcp_cleanup_rbuf(struct sock *sk, int copied);
1427 :
1428 : /* We provision sk_rcvbuf around 200% of sk_rcvlowat.
1429 : * If 87.5 % (7/8) of the space has been consumed, we want to override
1430 : * SO_RCVLOWAT constraint, since we are receiving skbs with too small
1431 : * len/truesize ratio.
1432 : */
1433 0 : static inline bool tcp_rmem_pressure(const struct sock *sk)
1434 : {
1435 0 : int rcvbuf, threshold;
1436 :
1437 0 : if (tcp_under_memory_pressure(sk))
1438 : return true;
1439 :
1440 0 : rcvbuf = READ_ONCE(sk->sk_rcvbuf);
1441 0 : threshold = rcvbuf - (rcvbuf >> 3);
1442 :
1443 0 : return atomic_read(&sk->sk_rmem_alloc) > threshold;
1444 : }
1445 :
1446 755 : static inline bool tcp_epollin_ready(const struct sock *sk, int target)
1447 : {
1448 755 : const struct tcp_sock *tp = tcp_sk(sk);
1449 755 : int avail = READ_ONCE(tp->rcv_nxt) - READ_ONCE(tp->copied_seq);
1450 :
1451 755 : if (avail <= 0)
1452 : return false;
1453 :
1454 143 : return (avail >= target) || tcp_rmem_pressure(sk) ||
1455 0 : (tcp_receive_window(tp) <= inet_csk(sk)->icsk_ack.rcv_mss);
1456 : }
1457 :
1458 : extern void tcp_openreq_init_rwin(struct request_sock *req,
1459 : const struct sock *sk_listener,
1460 : const struct dst_entry *dst);
1461 :
1462 : void tcp_enter_memory_pressure(struct sock *sk);
1463 : void tcp_leave_memory_pressure(struct sock *sk);
1464 :
1465 0 : static inline int keepalive_intvl_when(const struct tcp_sock *tp)
1466 : {
1467 0 : struct net *net = sock_net((struct sock *)tp);
1468 :
1469 0 : return tp->keepalive_intvl ? : net->ipv4.sysctl_tcp_keepalive_intvl;
1470 : }
1471 :
1472 4 : static inline int keepalive_time_when(const struct tcp_sock *tp)
1473 : {
1474 4 : struct net *net = sock_net((struct sock *)tp);
1475 :
1476 4 : return tp->keepalive_time ? : net->ipv4.sysctl_tcp_keepalive_time;
1477 : }
1478 :
1479 0 : static inline int keepalive_probes(const struct tcp_sock *tp)
1480 : {
1481 0 : struct net *net = sock_net((struct sock *)tp);
1482 :
1483 0 : return tp->keepalive_probes ? : net->ipv4.sysctl_tcp_keepalive_probes;
1484 : }
1485 :
1486 0 : static inline u32 keepalive_time_elapsed(const struct tcp_sock *tp)
1487 : {
1488 0 : const struct inet_connection_sock *icsk = &tp->inet_conn;
1489 :
1490 0 : return min_t(u32, tcp_jiffies32 - icsk->icsk_ack.lrcvtime,
1491 : tcp_jiffies32 - tp->rcv_tstamp);
1492 : }
1493 :
1494 0 : static inline int tcp_fin_time(const struct sock *sk)
1495 : {
1496 0 : int fin_timeout = tcp_sk(sk)->linger2 ? : sock_net(sk)->ipv4.sysctl_tcp_fin_timeout;
1497 0 : const int rto = inet_csk(sk)->icsk_rto;
1498 :
1499 0 : if (fin_timeout < (rto << 2) - (rto >> 1))
1500 : fin_timeout = (rto << 2) - (rto >> 1);
1501 :
1502 0 : return fin_timeout;
1503 : }
1504 :
1505 0 : static inline bool tcp_paws_check(const struct tcp_options_received *rx_opt,
1506 : int paws_win)
1507 : {
1508 0 : if ((s32)(rx_opt->ts_recent - rx_opt->rcv_tsval) <= paws_win)
1509 : return true;
1510 0 : if (unlikely(!time_before32(ktime_get_seconds(),
1511 : rx_opt->ts_recent_stamp + TCP_PAWS_24DAYS)))
1512 : return true;
1513 : /*
1514 : * Some OSes send SYN and SYNACK messages with tsval=0 tsecr=0,
1515 : * then following tcp messages have valid values. Ignore 0 value,
1516 : * or else 'negative' tsval might forbid us to accept their packets.
1517 : */
1518 0 : if (!rx_opt->ts_recent)
1519 0 : return true;
1520 : return false;
1521 : }
1522 :
1523 0 : static inline bool tcp_paws_reject(const struct tcp_options_received *rx_opt,
1524 : int rst)
1525 : {
1526 0 : if (tcp_paws_check(rx_opt, 0))
1527 : return false;
1528 :
1529 : /* RST segments are not recommended to carry timestamp,
1530 : and, if they do, it is recommended to ignore PAWS because
1531 : "their cleanup function should take precedence over timestamps."
1532 : Certainly, it is mistake. It is necessary to understand the reasons
1533 : of this constraint to relax it: if peer reboots, clock may go
1534 : out-of-sync and half-open connections will not be reset.
1535 : Actually, the problem would be not existing if all
1536 : the implementations followed draft about maintaining clock
1537 : via reboots. Linux-2.2 DOES NOT!
1538 :
1539 : However, we can relax time bounds for RST segments to MSL.
1540 : */
1541 0 : if (rst && !time_before32(ktime_get_seconds(),
1542 : rx_opt->ts_recent_stamp + TCP_PAWS_MSL))
1543 0 : return false;
1544 : return true;
1545 : }
1546 :
1547 : bool tcp_oow_rate_limited(struct net *net, const struct sk_buff *skb,
1548 : int mib_idx, u32 *last_oow_ack_time);
1549 :
1550 1 : static inline void tcp_mib_init(struct net *net)
1551 : {
1552 : /* See RFC 2012 */
1553 1 : TCP_ADD_STATS(net, TCP_MIB_RTOALGORITHM, 1);
1554 1 : TCP_ADD_STATS(net, TCP_MIB_RTOMIN, TCP_RTO_MIN*1000/HZ);
1555 1 : TCP_ADD_STATS(net, TCP_MIB_RTOMAX, TCP_RTO_MAX*1000/HZ);
1556 1 : TCP_ADD_STATS(net, TCP_MIB_MAXCONN, -1);
1557 : }
1558 :
1559 : /* from STCP */
1560 4 : static inline void tcp_clear_retrans_hints_partial(struct tcp_sock *tp)
1561 : {
1562 4 : tp->lost_skb_hint = NULL;
1563 : }
1564 :
1565 4 : static inline void tcp_clear_all_retrans_hints(struct tcp_sock *tp)
1566 : {
1567 4 : tcp_clear_retrans_hints_partial(tp);
1568 4 : tp->retransmit_skb_hint = NULL;
1569 0 : }
1570 :
1571 : union tcp_md5_addr { /* IPv4 address, or IPv6 address when CONFIG_IPV6 is enabled */
1572 : struct in_addr a4;
1573 : #if IS_ENABLED(CONFIG_IPV6)
1574 : struct in6_addr a6;
1575 : #endif
1576 : };
1577 :
1578 : /* - key database: one entry per configured MD5 key */
1579 : struct tcp_md5sig_key {
1580 : struct hlist_node node; /* hlist linkage; presumably into tcp_md5sig_info.head - confirm */
1581 : u8 keylen; /* presumably number of valid bytes in key[] - confirm */
1582 : u8 family; /* AF_INET or AF_INET6 */
1583 : u8 prefixlen;
1584 : union tcp_md5_addr addr;
1585 : int l3index; /* set if key added with L3 scope */
1586 : u8 key[TCP_MD5SIG_MAXKEYLEN];
1587 : struct rcu_head rcu;
1588 : };
1589 :
1590 : /* - sock block: hlist head plus an rcu_head */
1591 : struct tcp_md5sig_info {
1592 : struct hlist_head head; /* presumably the list of struct tcp_md5sig_key - confirm */
1593 : struct rcu_head rcu;
1594 : };
1595 :
1596 : /* - pseudo header (IPv4 variant; fields use big-endian types) */
1597 : struct tcp4_pseudohdr {
1598 : __be32 saddr;
1599 : __be32 daddr;
1600 : __u8 pad;
1601 : __u8 protocol;
1602 : __be16 len;
1603 : };
1604 :
1605 : struct tcp6_pseudohdr { /* IPv6 counterpart of struct tcp4_pseudohdr */
1606 : struct in6_addr saddr;
1607 : struct in6_addr daddr;
1608 : __be32 len;
1609 : __be32 protocol; /* including padding */
1610 : };
1611 :
1612 : union tcp_md5sum_block { /* storage for either pseudo header variant */
1613 : struct tcp4_pseudohdr ip4;
1614 : #if IS_ENABLED(CONFIG_IPV6)
1615 : struct tcp6_pseudohdr ip6; /* only present when CONFIG_IPV6 is enabled */
1616 : #endif
1617 : };
1618 :
1619 : /* - pool: digest algorithm, hash description and scratch buffer */
1620 : struct tcp_md5sig_pool {
1621 : struct ahash_request *md5_req; /* hash request used for the digest */
1622 : void *scratch; /* scratch buffer */
1623 : };
1624 :
1625 : /* - functions */
1626 : int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1627 : const struct sock *sk, const struct sk_buff *skb);
1628 : int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1629 : int family, u8 prefixlen, int l3index,
1630 : const u8 *newkey, u8 newkeylen, gfp_t gfp);
1631 : int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr,
1632 : int family, u8 prefixlen, int l3index);
1633 : struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1634 : const struct sock *addr_sk);
1635 :
1636 : #ifdef CONFIG_TCP_MD5SIG
1637 : #include <linux/jump_label.h>
1638 : extern struct static_key_false tcp_md5_needed;
1639 : struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1640 : const union tcp_md5_addr *addr,
1641 : int family);
1642 : static inline struct tcp_md5sig_key *
1643 : tcp_md5_do_lookup(const struct sock *sk, int l3index,
1644 : const union tcp_md5_addr *addr, int family)
1645 : {
1646 : if (!static_branch_unlikely(&tcp_md5_needed))
1647 : return NULL;
1648 : return __tcp_md5_do_lookup(sk, l3index, addr, family);
1649 : }
1650 :
1651 : #define tcp_twsk_md5_key(twsk) ((twsk)->tw_md5_key)
1652 : #else
1653 : static inline struct tcp_md5sig_key *
1654 0 : tcp_md5_do_lookup(const struct sock *sk, int l3index,
1655 : const union tcp_md5_addr *addr, int family)
1656 : {
1657 0 : return NULL;
1658 : }
1659 : #define tcp_twsk_md5_key(twsk) NULL
1660 : #endif
1661 :
1662 : bool tcp_alloc_md5sig_pool(void);
1663 :
1664 : struct tcp_md5sig_pool *tcp_get_md5sig_pool(void);
/* Re-enable bottom halves.
 * NOTE(review): presumably the counterpart of tcp_get_md5sig_pool(),
 * which would leave them disabled - confirm in its implementation.
 */
static inline void tcp_put_md5sig_pool(void)
{
	local_bh_enable();
}
1669 :
1670 : int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *, const struct sk_buff *,
1671 : unsigned int header_len);
1672 : int tcp_md5_hash_key(struct tcp_md5sig_pool *hp,
1673 : const struct tcp_md5sig_key *key);
1674 :
1675 : /* From tcp_fastopen.c */
1676 : void tcp_fastopen_cache_get(struct sock *sk, u16 *mss,
1677 : struct tcp_fastopen_cookie *cookie);
1678 : void tcp_fastopen_cache_set(struct sock *sk, u16 mss,
1679 : struct tcp_fastopen_cookie *cookie, bool syn_lost,
1680 : u16 try_exp);
1681 : struct tcp_fastopen_request { /* state of one Fast Open request */
1682 : /* Fast Open cookie. Size 0 means a cookie request */
1683 : struct tcp_fastopen_cookie cookie;
1684 : struct msghdr *data; /* data in MSG_FASTOPEN */
1685 : size_t size; /* presumably the size of that data - confirm */
1686 : int copied; /* queued in tcp_connect() */
1687 : struct ubuf_info *uarg;
1688 : };
1689 : void tcp_free_fastopen_req(struct tcp_sock *tp);
1690 : void tcp_fastopen_destroy_cipher(struct sock *sk);
1691 : void tcp_fastopen_ctx_destroy(struct net *net);
1692 : int tcp_fastopen_reset_cipher(struct net *net, struct sock *sk,
1693 : void *primary_key, void *backup_key);
1694 : int tcp_fastopen_get_cipher(struct net *net, struct inet_connection_sock *icsk,
1695 : u64 *key);
1696 : void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb);
1697 : struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb,
1698 : struct request_sock *req,
1699 : struct tcp_fastopen_cookie *foc,
1700 : const struct dst_entry *dst);
1701 : void tcp_fastopen_init_key_once(struct net *net);
1702 : bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss,
1703 : struct tcp_fastopen_cookie *cookie);
1704 : bool tcp_fastopen_defer_connect(struct sock *sk, int *err);
1705 : #define TCP_FASTOPEN_KEY_LENGTH sizeof(siphash_key_t)
1706 : #define TCP_FASTOPEN_KEY_MAX 2
1707 : #define TCP_FASTOPEN_KEY_BUF_LENGTH \
1708 : (TCP_FASTOPEN_KEY_LENGTH * TCP_FASTOPEN_KEY_MAX)
1709 :
1710 : /* Fastopen key context */
1711 : struct tcp_fastopen_context {
/* Room for up to TCP_FASTOPEN_KEY_MAX siphash keys */
1712 : siphash_key_t key[TCP_FASTOPEN_KEY_MAX];
/* Value returned by tcp_fastopen_context_len(); presumably the number of
 * valid entries in key[] - confirm against the code that fills it in.
 */
1713 : int num;
/* Contexts are read through rcu_dereference() (see tcp_fastopen_get_ctx()) */
1714 : struct rcu_head rcu;
1715 : };
1716 :
1717 : extern unsigned int sysctl_tcp_fastopen_blackhole_timeout;
1718 : void tcp_fastopen_active_disable(struct sock *sk);
1719 : bool tcp_fastopen_active_should_disable(struct sock *sk);
1720 : void tcp_fastopen_active_disable_ofo_check(struct sock *sk);
1721 : void tcp_fastopen_active_detect_blackhole(struct sock *sk, bool expired);
1722 :
1723 : /* Caller needs to wrap with rcu_read_(un)lock() */
1724 : static inline
1725 0 : struct tcp_fastopen_context *tcp_fastopen_get_ctx(const struct sock *sk)
1726 : {
1727 0 : struct tcp_fastopen_context *ctx;
1728 :
1729 0 : ctx = rcu_dereference(inet_csk(sk)->icsk_accept_queue.fastopenq.ctx);
1730 0 : if (!ctx)
1731 0 : ctx = rcu_dereference(sock_net(sk)->ipv4.tcp_fastopen_ctx);
1732 0 : return ctx;
1733 : }
1734 :
1735 : static inline
1736 0 : bool tcp_fastopen_cookie_match(const struct tcp_fastopen_cookie *foc,
1737 : const struct tcp_fastopen_cookie *orig)
1738 : {
1739 0 : if (orig->len == TCP_FASTOPEN_COOKIE_SIZE &&
1740 0 : orig->len == foc->len &&
1741 0 : !memcmp(orig->val, foc->val, foc->len))
1742 0 : return true;
1743 : return false;
1744 : }
1745 :
1746 : static inline
1747 0 : int tcp_fastopen_context_len(const struct tcp_fastopen_context *ctx)
1748 : {
1749 0 : return ctx->num;
1750 : }
1751 :
1752 : /* Latencies incurred by various limits for a sender. They are
1753 : * chronograph-like stats that are mutually exclusive.
1754 : */
/* Passed as the @type argument of tcp_chrono_start()/tcp_chrono_stop(). */
1755 : enum tcp_chrono {
1756 : TCP_CHRONO_UNSPEC,
1757 : TCP_CHRONO_BUSY, /* Actively sending data (non-empty write queue) */
1758 : TCP_CHRONO_RWND_LIMITED, /* Stalled by insufficient receive window */
1759 : TCP_CHRONO_SNDBUF_LIMITED, /* Stalled by insufficient send buffer */
/* Last enumerator: one past the highest chrono type above */
1760 : __TCP_CHRONO_MAX,
1761 : };
1762 :
1763 : void tcp_chrono_start(struct sock *sk, const enum tcp_chrono type);
1764 : void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type);
1765 :
1766 : /* This helper is needed, because skb->tcp_tsorted_anchor uses
1767 : * the same memory storage than skb->destructor/_skb_refdst
1768 : */
1769 364 : static inline void tcp_skb_tsorted_anchor_cleanup(struct sk_buff *skb)
1770 : {
1771 364 : skb->destructor = NULL;
1772 364 : skb->_skb_refdst = 0UL;
1773 : }
1774 :
/* tcp_skb_tsorted_save() / tcp_skb_tsorted_restore() form a bracket and
 * must be used as a pair in the same scope: save opens a block, stashes
 * skb->_skb_refdst in the block-local '_save' and zeroes the field;
 * restore writes '_save' back and closes the block.
 *
 * The macro argument is parenthesized so that any pointer expression
 * (not just a plain identifier) expands correctly.
 */
#define tcp_skb_tsorted_save(skb) {				\
	unsigned long _save = (skb)->_skb_refdst;		\
	(skb)->_skb_refdst = 0UL;

#define tcp_skb_tsorted_restore(skb)				\
	(skb)->_skb_refdst = _save;				\
}
1782 :
1783 : void tcp_write_queue_purge(struct sock *sk);
1784 :
1785 0 : static inline struct sk_buff *tcp_rtx_queue_head(const struct sock *sk)
1786 : {
1787 0 : return skb_rb_first(&sk->tcp_rtx_queue);
1788 : }
1789 :
1790 0 : static inline struct sk_buff *tcp_rtx_queue_tail(const struct sock *sk)
1791 : {
1792 0 : return skb_rb_last(&sk->tcp_rtx_queue);
1793 : }
1794 :
1795 : static inline struct sk_buff *tcp_write_queue_head(const struct sock *sk)
1796 : {
1797 : return skb_peek(&sk->sk_write_queue);
1798 : }
1799 :
1800 1236 : static inline struct sk_buff *tcp_write_queue_tail(const struct sock *sk)
1801 : {
1802 1286 : return skb_peek_tail(&sk->sk_write_queue);
1803 : }
1804 :
/* Walk @sk's write queue with skb_queue_walk_from_safe(), passing @skb as
 * the cursor and @tmp as the scratch cursor.
 * NOTE(review): presumably safe against unlinking @skb in the loop body, as
 * with other *_safe walkers - confirm in <linux/skbuff.h>.
 */
1805 : #define tcp_for_write_queue_from_safe(skb, tmp, sk) \
1806 : skb_queue_walk_from_safe(&(sk)->sk_write_queue, skb, tmp)
1807 :
1808 1177 : static inline struct sk_buff *tcp_send_head(const struct sock *sk)
1809 : {
1810 807 : return skb_peek(&sk->sk_write_queue);
1811 : }
1812 :
1813 366 : static inline bool tcp_skb_is_last(const struct sock *sk,
1814 : const struct sk_buff *skb)
1815 : {
1816 366 : return skb_queue_is_last(&sk->sk_write_queue, skb);
1817 : }
1818 :
1819 : /**
1820 : * tcp_write_queue_empty - test if any payload (or FIN) is available in write queue
1821 : * @sk: socket
1822 : *
1823 : * Since the write queue can have a temporary empty skb in it,
1824 : * we must not use "return skb_queue_empty(&sk->sk_write_queue)"
1825 : */
1826 859 : static inline bool tcp_write_queue_empty(const struct sock *sk)
1827 : {
1828 859 : const struct tcp_sock *tp = tcp_sk(sk);
1829 :
1830 384 : return tp->write_seq == tp->snd_nxt;
1831 : }
1832 :
1833 1432 : static inline bool tcp_rtx_queue_empty(const struct sock *sk)
1834 : {
1835 411 : return RB_EMPTY_ROOT(&sk->tcp_rtx_queue);
1836 : }
1837 :
1838 1021 : static inline bool tcp_rtx_and_write_queues_empty(const struct sock *sk)
1839 : {
1840 1035 : return tcp_rtx_queue_empty(sk) && tcp_write_queue_empty(sk);
1841 : }
1842 :
1843 364 : static inline void tcp_add_write_queue_tail(struct sock *sk, struct sk_buff *skb)
1844 : {
1845 364 : __skb_queue_tail(&sk->sk_write_queue, skb);
1846 :
1847 : /* Queue it, remembering where we must start sending. */
1848 364 : if (sk->sk_write_queue.next == skb)
1849 364 : tcp_chrono_start(sk, TCP_CHRONO_BUSY);
1850 364 : }
1851 :
1852 : /* Insert new before skb on the write queue of sk. */
1853 0 : static inline void tcp_insert_write_queue_before(struct sk_buff *new,
1854 : struct sk_buff *skb,
1855 : struct sock *sk)
1856 : {
1857 0 : __skb_queue_before(&sk->sk_write_queue, skb, new);
1858 : }
1859 :
1860 0 : static inline void tcp_unlink_write_queue(struct sk_buff *skb, struct sock *sk)
1861 : {
1862 0 : tcp_skb_tsorted_anchor_cleanup(skb);
1863 0 : __skb_unlink(skb, &sk->sk_write_queue);
1864 : }
1865 :
1866 : void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb);
1867 :
1868 364 : static inline void tcp_rtx_queue_unlink(struct sk_buff *skb, struct sock *sk)
1869 : {
1870 364 : tcp_skb_tsorted_anchor_cleanup(skb);
1871 364 : rb_erase(&skb->rbnode, &sk->tcp_rtx_queue);
1872 : }
1873 :
1874 364 : static inline void tcp_rtx_queue_unlink_and_free(struct sk_buff *skb, struct sock *sk)
1875 : {
1876 364 : list_del(&skb->tcp_tsorted_anchor);
1877 364 : tcp_rtx_queue_unlink(skb, sk);
1878 364 : sk_wmem_free_skb(sk, skb);
1879 364 : }
1880 :
1881 372 : static inline void tcp_push_pending_frames(struct sock *sk)
1882 : {
1883 372 : if (tcp_send_head(sk)) {
1884 24 : struct tcp_sock *tp = tcp_sk(sk);
1885 :
1886 24 : __tcp_push_pending_frames(sk, tcp_current_mss(sk), tp->nonagle);
1887 : }
1888 372 : }
1889 :
1890 : /* Start sequence of the skb just after the highest skb with SACKed
1891 : * bit, valid only if sacked_out > 0 or when the caller has ensured
1892 : * validity by itself.
1893 : */
1894 0 : static inline u32 tcp_highest_sack_seq(struct tcp_sock *tp)
1895 : {
1896 0 : if (!tp->sacked_out)
1897 0 : return tp->snd_una;
1898 :
1899 0 : if (tp->highest_sack == NULL)
1900 0 : return tp->snd_nxt;
1901 :
1902 0 : return TCP_SKB_CB(tp->highest_sack)->seq;
1903 : }
1904 :
1905 0 : static inline void tcp_advance_highest_sack(struct sock *sk, struct sk_buff *skb)
1906 : {
1907 0 : tcp_sk(sk)->highest_sack = skb_rb_next(skb);
1908 0 : }
1909 :
1910 364 : static inline struct sk_buff *tcp_highest_sack(struct sock *sk)
1911 : {
1912 0 : return tcp_sk(sk)->highest_sack;
1913 : }
1914 :
1915 0 : static inline void tcp_highest_sack_reset(struct sock *sk)
1916 : {
1917 0 : tcp_sk(sk)->highest_sack = tcp_rtx_queue_head(sk);
1918 0 : }
1919 :
1920 : /* Called when old skb is about to be deleted and replaced by new skb */
1921 364 : static inline void tcp_highest_sack_replace(struct sock *sk,
1922 : struct sk_buff *old,
1923 : struct sk_buff *new)
1924 : {
1925 364 : if (old == tcp_highest_sack(sk))
1926 364 : tcp_sk(sk)->highest_sack = new;
1927 : }
1928 :
1929 : /* This helper checks if socket has IP_TRANSPARENT set */
1930 0 : static inline bool inet_sk_transparent(const struct sock *sk)
1931 : {
1932 0 : switch (sk->sk_state) {
1933 : case TCP_TIME_WAIT:
1934 0 : return inet_twsk(sk)->tw_transparent;
1935 : case TCP_NEW_SYN_RECV:
1936 0 : return inet_rsk(inet_reqsk(sk))->no_srccheck;
1937 : }
1938 0 : return inet_sk(sk)->transparent;
1939 : }
1940 :
1941 : /* Determines whether this is a thin stream (which may suffer from
1942 : * increased latency). Used to trigger latency-reducing mechanisms.
1943 : */
1944 0 : static inline bool tcp_stream_is_thin(struct tcp_sock *tp)
1945 : {
1946 0 : return tp->packets_out < 4 && !tcp_in_initial_slowstart(tp);
1947 : }
1948 :
1949 : /* /proc */
/* Values stored in the 'state' member of struct tcp_iter_state. */
1950 : enum tcp_seq_states {
1951 : TCP_SEQ_STATE_LISTENING,
1952 : TCP_SEQ_STATE_ESTABLISHED,
1953 : };
1954 :
1955 : void *tcp_seq_start(struct seq_file *seq, loff_t *pos);
1956 : void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos);
1957 : void tcp_seq_stop(struct seq_file *seq, void *v);
1958 :
/* Address-family selector, referenced from struct tcp_iter_state.
 * NOTE(review): presumably restricts the seq_file walk to one family -
 * confirm with the tcp_seq_*() implementations.
 */
1959 : struct tcp_seq_afinfo {
1960 : sa_family_t family;
1961 : };
1962 :
/* Iterator state for the tcp_seq_start()/tcp_seq_next()/tcp_seq_stop()
 * seq_file walkers.
 * NOTE(review): per-field meanings below are inferred from names only and
 * should be confirmed against the iterator implementation.
 */
1963 : struct tcp_iter_state {
1964 : struct seq_net_private p;
/* One of TCP_SEQ_STATE_LISTENING / TCP_SEQ_STATE_ESTABLISHED */
1965 : enum tcp_seq_states state;
1966 : struct sock *syn_wait_sk;
1967 : struct tcp_seq_afinfo *bpf_seq_afinfo;
/* presumably hash bucket cursor and position bookkeeping - TODO confirm */
1968 : int bucket, offset, sbucket, num;
1969 : loff_t last_pos;
1970 : };
1971 :
1972 : extern struct request_sock_ops tcp_request_sock_ops;
1973 : extern struct request_sock_ops tcp6_request_sock_ops;
1974 :
1975 : void tcp_v4_destroy_sock(struct sock *sk);
1976 :
1977 : struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
1978 : netdev_features_t features);
1979 : struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb);
1980 : INDIRECT_CALLABLE_DECLARE(int tcp4_gro_complete(struct sk_buff *skb, int thoff));
1981 : INDIRECT_CALLABLE_DECLARE(struct sk_buff *tcp4_gro_receive(struct list_head *head, struct sk_buff *skb));
1982 : INDIRECT_CALLABLE_DECLARE(int tcp6_gro_complete(struct sk_buff *skb, int thoff));
1983 : INDIRECT_CALLABLE_DECLARE(struct sk_buff *tcp6_gro_receive(struct list_head *head, struct sk_buff *skb));
1984 : int tcp_gro_complete(struct sk_buff *skb);
1985 :
1986 : void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr);
1987 :
1988 1046 : static inline u32 tcp_notsent_lowat(const struct tcp_sock *tp)
1989 : {
1990 1046 : struct net *net = sock_net((struct sock *)tp);
1991 1046 : return tp->notsent_lowat ?: net->ipv4.sysctl_tcp_notsent_lowat;
1992 : }
1993 :
1994 : bool tcp_stream_memory_free(const struct sock *sk, int wake);
1995 :
1996 : #ifdef CONFIG_PROC_FS
1997 : int tcp4_proc_init(void);
1998 : void tcp4_proc_exit(void);
1999 : #endif
2000 :
2001 : int tcp_rtx_synack(const struct sock *sk, struct request_sock *req);
2002 : int tcp_conn_request(struct request_sock_ops *rsk_ops,
2003 : const struct tcp_request_sock_ops *af_ops,
2004 : struct sock *sk, struct sk_buff *skb);
2005 :
2006 : /* TCP af-specific functions */
/* All members are MD5-signature hooks and exist only when
 * CONFIG_TCP_MD5SIG is set; without it the structure has no members.
 */
2007 : struct tcp_sock_af_ops {
2008 : #ifdef CONFIG_TCP_MD5SIG
/* Key lookup for @addr_sk on behalf of @sk */
2009 : struct tcp_md5sig_key *(*md5_lookup) (const struct sock *sk,
2010 : const struct sock *addr_sk);
/* NOTE(review): presumably writes the hash of @skb to @location - confirm */
2011 : int (*calc_md5_hash)(char *location,
2012 : const struct tcp_md5sig_key *md5,
2013 : const struct sock *sk,
2014 : const struct sk_buff *skb);
/* NOTE(review): looks like the setsockopt-style parser for MD5 keys - confirm */
2015 : int (*md5_parse)(struct sock *sk,
2016 : int optname,
2017 : sockptr_t optval,
2018 : int optlen);
2019 : #endif
2020 : };
2021 :
/* Address-family specific hooks for request sockets; passed as @af_ops to
 * tcp_conn_request(). IPv4 and (optionally) IPv6 instances are declared
 * right after this structure.
 */
2022 : struct tcp_request_sock_ops {
2023 : u16 mss_clamp;
2024 : #ifdef CONFIG_TCP_MD5SIG
2025 : struct tcp_md5sig_key *(*req_md5_lookup)(const struct sock *sk,
2026 : const struct sock *addr_sk);
2027 : int (*calc_md5_hash) (char *location,
2028 : const struct tcp_md5sig_key *md5,
2029 : const struct sock *sk,
2030 : const struct sk_buff *skb);
2031 : #endif
2032 : #ifdef CONFIG_SYN_COOKIES
/* Invoked by cookie_init_sequence() */
2033 : __u32 (*cookie_init_seq)(const struct sk_buff *skb,
2034 : __u16 *mss);
2035 : #endif
2036 : struct dst_entry *(*route_req)(const struct sock *sk,
2037 : struct sk_buff *skb,
2038 : struct flowi *fl,
2039 : struct request_sock *req);
2040 : u32 (*init_seq)(const struct sk_buff *skb);
2041 : u32 (*init_ts_off)(const struct net *net, const struct sk_buff *skb);
2042 : int (*send_synack)(const struct sock *sk, struct dst_entry *dst,
2043 : struct flowi *fl, struct request_sock *req,
2044 : struct tcp_fastopen_cookie *foc,
2045 : enum tcp_synack_type synack_type,
2046 : struct sk_buff *syn_skb);
2047 : };
2048 :
2049 : extern const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops;
2050 : #if IS_ENABLED(CONFIG_IPV6)
2051 : extern const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops;
2052 : #endif
2053 :
2054 : #ifdef CONFIG_SYN_COOKIES
2055 : static inline __u32 cookie_init_sequence(const struct tcp_request_sock_ops *ops,
2056 : const struct sock *sk, struct sk_buff *skb,
2057 : __u16 *mss)
2058 : {
2059 : tcp_synq_overflow(sk);
2060 : __NET_INC_STATS(sock_net(sk), LINUX_MIB_SYNCOOKIESSENT);
2061 : return ops->cookie_init_seq(skb, mss);
2062 : }
2063 : #else
2064 0 : static inline __u32 cookie_init_sequence(const struct tcp_request_sock_ops *ops,
2065 : const struct sock *sk, struct sk_buff *skb,
2066 : __u16 *mss)
2067 : {
2068 0 : return 0;
2069 : }
2070 : #endif
2071 :
2072 : int tcpv4_offload_init(void);
2073 :
2074 : void tcp_v4_init(void);
2075 : void tcp_init(void);
2076 :
2077 : /* tcp_recovery.c */
2078 : void tcp_mark_skb_lost(struct sock *sk, struct sk_buff *skb);
2079 : void tcp_newreno_mark_lost(struct sock *sk, bool snd_una_advanced);
2080 : extern s32 tcp_rack_skb_timeout(struct tcp_sock *tp, struct sk_buff *skb,
2081 : u32 reo_wnd);
2082 : extern bool tcp_rack_mark_lost(struct sock *sk);
2083 : extern void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq,
2084 : u64 xmit_time);
2085 : extern void tcp_rack_reo_timeout(struct sock *sk);
2086 : extern void tcp_rack_update_reo_wnd(struct sock *sk, struct rate_sample *rs);
2087 :
2088 : /* At how many usecs into the future should the RTO fire? */
2089 0 : static inline s64 tcp_rto_delta_us(const struct sock *sk)
2090 : {
2091 0 : const struct sk_buff *skb = tcp_rtx_queue_head(sk);
2092 0 : u32 rto = inet_csk(sk)->icsk_rto;
2093 0 : u64 rto_time_stamp_us = tcp_skb_timestamp_us(skb) + jiffies_to_usecs(rto);
2094 :
2095 0 : return rto_time_stamp_us - tcp_sk(sk)->tcp_mstamp;
2096 : }
2097 :
2098 : /*
2099 : * Save and compile IPv4 options, return a pointer to it
2100 : */
2101 4 : static inline struct ip_options_rcu *tcp_v4_save_options(struct net *net,
2102 : struct sk_buff *skb)
2103 : {
2104 4 : const struct ip_options *opt = &TCP_SKB_CB(skb)->header.h4.opt;
2105 4 : struct ip_options_rcu *dopt = NULL;
2106 :
2107 4 : if (opt->optlen) {
2108 0 : int opt_size = sizeof(*dopt) + opt->optlen;
2109 :
2110 0 : dopt = kmalloc(opt_size, GFP_ATOMIC);
2111 0 : if (dopt && __ip_options_echo(net, &dopt->opt, skb, opt)) {
2112 0 : kfree(dopt);
2113 0 : dopt = NULL;
2114 : }
2115 : }
2116 4 : return dopt;
2117 : }
2118 :
2119 : /* locally generated TCP pure ACKs have skb->truesize == 2
2120 : * (check tcp_send_ack() in net/ipv4/tcp_output.c )
2121 : * This is much faster than dissecting the packet to find out.
2122 : * (Think of GRE encapsulations, IPv4, IPv6, ...)
2123 : */
2124 426 : static inline bool skb_is_tcp_pure_ack(const struct sk_buff *skb)
2125 : {
2126 426 : return skb->truesize == 2;
2127 : }
2128 :
2129 62 : static inline void skb_set_tcp_pure_ack(struct sk_buff *skb)
2130 : {
2131 62 : skb->truesize = 2;
2132 : }
2133 :
2134 0 : static inline int tcp_inq(struct sock *sk)
2135 : {
2136 0 : struct tcp_sock *tp = tcp_sk(sk);
2137 0 : int answ;
2138 :
2139 0 : if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
2140 : answ = 0;
2141 0 : } else if (sock_flag(sk, SOCK_URGINLINE) ||
2142 0 : !tp->urg_data ||
2143 0 : before(tp->urg_seq, tp->copied_seq) ||
2144 0 : !before(tp->urg_seq, tp->rcv_nxt)) {
2145 :
2146 0 : answ = tp->rcv_nxt - tp->copied_seq;
2147 :
2148 : /* Subtract 1, if FIN was received */
2149 0 : if (answ && sock_flag(sk, SOCK_DONE))
2150 0 : answ--;
2151 : } else {
2152 : answ = tp->urg_seq - tp->copied_seq;
2153 : }
2154 :
2155 0 : return answ;
2156 : }
2157 :
2158 : int tcp_peek_len(struct socket *sock);
2159 :
2160 434 : static inline void tcp_segs_in(struct tcp_sock *tp, const struct sk_buff *skb)
2161 : {
2162 434 : u16 segs_in;
2163 :
2164 434 : segs_in = max_t(u16, 1, skb_shinfo(skb)->gso_segs);
2165 434 : tp->segs_in += segs_in;
2166 434 : if (skb->len > tcp_hdrlen(skb))
2167 67 : tp->data_segs_in += segs_in;
2168 434 : }
2169 :
2170 : /*
2171 : * TCP listen path runs lockless.
2172 : * We forced "struct sock" to be const qualified to make sure
2173 : * we don't modify one of its field by mistake.
2174 : * Here, we increment sk_drops which is an atomic_t, so we can safely
2175 : * make sock writable again.
2176 : */
2177 0 : static inline void tcp_listendrop(const struct sock *sk)
2178 : {
2179 0 : atomic_inc(&((struct sock *)sk)->sk_drops);
2180 0 : __NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENDROPS);
2181 0 : }
2182 :
2183 : enum hrtimer_restart tcp_pace_kick(struct hrtimer *timer);
2184 :
2185 : /*
2186 : * Interface for adding Upper Level Protocols over TCP
2187 : */
2188 :
2189 : #define TCP_ULP_NAME_MAX 16
2190 : #define TCP_ULP_MAX 128
2191 : #define TCP_ULP_BUF_MAX (TCP_ULP_NAME_MAX*TCP_ULP_MAX)
2192 :
/* Operations table for one Upper Level Protocol; instances are handed to
 * tcp_register_ulp() / tcp_unregister_ulp().
 */
2193 : struct tcp_ulp_ops {
/* NOTE(review): presumably links into the registered-ULP list - confirm */
2194 : struct list_head list;
2195 :
2196 : /* initialize ulp */
2197 : int (*init)(struct sock *sk);
2198 : /* update ulp */
2199 : void (*update)(struct sock *sk, struct proto *p,
2200 : void (*write_space)(struct sock *sk));
2201 : /* cleanup ulp */
2202 : void (*release)(struct sock *sk);
2203 : /* diagnostic */
2204 : int (*get_info)(const struct sock *sk, struct sk_buff *skb);
2205 : size_t (*get_info_size)(const struct sock *sk);
2206 : /* clone ulp */
2207 : void (*clone)(const struct request_sock *req, struct sock *newsk,
2208 : const gfp_t priority);
2209 :
/* Name buffer of TCP_ULP_NAME_MAX bytes */
2210 : char name[TCP_ULP_NAME_MAX];
2211 : struct module *owner;
2212 : };
2213 : int tcp_register_ulp(struct tcp_ulp_ops *type);
2214 : void tcp_unregister_ulp(struct tcp_ulp_ops *type);
2215 : int tcp_set_ulp(struct sock *sk, const char *name);
2216 : void tcp_get_available_ulp(char *buf, size_t len);
2217 : void tcp_cleanup_ulp(struct sock *sk);
2218 : void tcp_update_ulp(struct sock *sk, struct proto *p,
2219 : void (*write_space)(struct sock *sk));
2220 :
/* Emit two module alias records for a ULP module: the bare @name and
 * "tcp-ulp-" prepended to @name. @name must be a string literal because
 * it is concatenated at compile time.
 */
2221 : #define MODULE_ALIAS_TCP_ULP(name) \
2222 : __MODULE_INFO(alias, alias_userspace, name); \
2223 : __MODULE_INFO(alias, alias_tcp_ulp, "tcp-ulp-" name)
2224 :
2225 : struct sk_msg;
2226 : struct sk_psock;
2227 :
2228 : #ifdef CONFIG_BPF_STREAM_PARSER
2229 : struct proto *tcp_bpf_get_proto(struct sock *sk, struct sk_psock *psock);
2230 : void tcp_bpf_clone(const struct sock *sk, struct sock *newsk);
2231 : #else
/* !CONFIG_BPF_STREAM_PARSER stub: nothing to clone, intentionally empty. */
static inline void tcp_bpf_clone(const struct sock *sk, struct sock *newsk)
{
	/* no-op */
}
2235 : #endif /* CONFIG_BPF_STREAM_PARSER */
2236 :
2237 : #ifdef CONFIG_NET_SOCK_MSG
2238 : int tcp_bpf_sendmsg_redir(struct sock *sk, struct sk_msg *msg, u32 bytes,
2239 : int flags);
2240 : int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock,
2241 : struct msghdr *msg, int len, int flags);
2242 : #endif /* CONFIG_NET_SOCK_MSG */
2243 :
2244 : #ifdef CONFIG_CGROUP_BPF
2245 : static inline void bpf_skops_init_skb(struct bpf_sock_ops_kern *skops,
2246 : struct sk_buff *skb,
2247 : unsigned int end_offset)
2248 : {
2249 : skops->skb = skb;
2250 : skops->skb_data_end = skb->data + end_offset;
2251 : }
2252 : #else
/* !CONFIG_CGROUP_BPF stub: intentionally does nothing. */
static inline void bpf_skops_init_skb(struct bpf_sock_ops_kern *skops,
				      struct sk_buff *skb,
				      unsigned int end_offset)
{
	/* no-op */
}
2258 : #endif
2259 :
2260 : /* Call BPF_SOCK_OPS program that returns an int. If the return value
2261 : * is < 0, then the BPF op failed (for example if the loaded BPF
2262 : * program does not support the chosen operation or there is no BPF
2263 : * program loaded).
2264 : */
2265 : #ifdef CONFIG_BPF
2266 15 : static inline int tcp_call_bpf(struct sock *sk, int op, u32 nargs, u32 *args)
2267 : {
2268 15 : struct bpf_sock_ops_kern sock_ops;
2269 15 : int ret;
2270 :
2271 15 : memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));
2272 15 : if (sk_fullsock(sk)) {
2273 3 : sock_ops.is_fullsock = 1;
2274 3 : sock_owned_by_me(sk);
2275 : }
2276 :
2277 15 : sock_ops.sk = sk;
2278 15 : sock_ops.op = op;
2279 15 : if (nargs > 0)
2280 0 : memcpy(sock_ops.args, args, nargs * sizeof(*args));
2281 :
2282 15 : ret = BPF_CGROUP_RUN_PROG_SOCK_OPS(&sock_ops);
2283 15 : if (ret == 0)
2284 15 : ret = sock_ops.reply;
2285 : else
2286 : ret = -1;
2287 15 : return ret;
2288 : }
2289 :
2290 0 : static inline int tcp_call_bpf_2arg(struct sock *sk, int op, u32 arg1, u32 arg2)
2291 : {
2292 0 : u32 args[2] = {arg1, arg2};
2293 :
2294 0 : return tcp_call_bpf(sk, op, 2, args);
2295 : }
2296 :
2297 0 : static inline int tcp_call_bpf_3arg(struct sock *sk, int op, u32 arg1, u32 arg2,
2298 : u32 arg3)
2299 : {
2300 0 : u32 args[3] = {arg1, arg2, arg3};
2301 :
2302 0 : return tcp_call_bpf(sk, op, 3, args);
2303 : }
2304 :
2305 : #else
2306 : static inline int tcp_call_bpf(struct sock *sk, int op, u32 nargs, u32 *args)
2307 : {
2308 : return -EPERM;
2309 : }
2310 :
2311 : static inline int tcp_call_bpf_2arg(struct sock *sk, int op, u32 arg1, u32 arg2)
2312 : {
2313 : return -EPERM;
2314 : }
2315 :
2316 : static inline int tcp_call_bpf_3arg(struct sock *sk, int op, u32 arg1, u32 arg2,
2317 : u32 arg3)
2318 : {
2319 : return -EPERM;
2320 : }
2321 :
2322 : #endif
2323 :
2324 4 : static inline u32 tcp_timeout_init(struct sock *sk)
2325 : {
2326 4 : int timeout;
2327 :
2328 4 : timeout = tcp_call_bpf(sk, BPF_SOCK_OPS_TIMEOUT_INIT, 0, NULL);
2329 :
2330 4 : if (timeout <= 0)
2331 4 : timeout = TCP_TIMEOUT_INIT;
2332 4 : return timeout;
2333 : }
2334 :
2335 4 : static inline u32 tcp_rwnd_init_bpf(struct sock *sk)
2336 : {
2337 4 : int rwnd;
2338 :
2339 4 : rwnd = tcp_call_bpf(sk, BPF_SOCK_OPS_RWND_INIT, 0, NULL);
2340 :
2341 4 : if (rwnd < 0)
2342 : rwnd = 0;
2343 4 : return rwnd;
2344 : }
2345 :
2346 4 : static inline bool tcp_bpf_ca_needs_ecn(struct sock *sk)
2347 : {
2348 4 : return (tcp_call_bpf(sk, BPF_SOCK_OPS_NEEDS_ECN, 0, NULL) == 1);
2349 : }
2350 :
2351 258 : static inline void tcp_bpf_rtt(struct sock *sk)
2352 : {
2353 258 : if (BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), BPF_SOCK_OPS_RTT_CB_FLAG))
2354 0 : tcp_call_bpf(sk, BPF_SOCK_OPS_RTT_CB, 0, NULL);
2355 258 : }
2356 :
2357 : #if IS_ENABLED(CONFIG_SMC)
2358 : extern struct static_key_false tcp_have_smc;
2359 : #endif
2360 :
2361 : #if IS_ENABLED(CONFIG_TLS_DEVICE)
2362 : void clean_acked_data_enable(struct inet_connection_sock *icsk,
2363 : void (*cad)(struct sock *sk, u32 ack_seq));
2364 : void clean_acked_data_disable(struct inet_connection_sock *icsk);
2365 : void clean_acked_data_flush(void);
2366 : #endif
2367 :
2368 : DECLARE_STATIC_KEY_FALSE(tcp_tx_delay_enabled);
2369 430 : static inline void tcp_add_tx_delay(struct sk_buff *skb,
2370 : const struct tcp_sock *tp)
2371 : {
2372 430 : if (static_branch_unlikely(&tcp_tx_delay_enabled))
2373 0 : skb->skb_mstamp_ns += (u64)tp->tcp_tx_delay * NSEC_PER_USEC;
2374 430 : }
2375 :
2376 : /* Compute Earliest Departure Time for some control packets
2377 : * like ACK or RST for TIME_WAIT or non ESTABLISHED sockets.
2378 : */
2379 0 : static inline u64 tcp_transmit_time(const struct sock *sk)
2380 : {
2381 0 : if (static_branch_unlikely(&tcp_tx_delay_enabled)) {
2382 0 : u32 delay = (sk->sk_state == TCP_TIME_WAIT) ?
2383 0 : tcp_twsk(sk)->tw_tx_delay : tcp_sk(sk)->tcp_tx_delay;
2384 :
2385 0 : return tcp_clock_ns() + (u64)delay * NSEC_PER_USEC;
2386 : }
2387 : return 0;
2388 : }
2389 :
2390 : #endif /* _TCP_H */
|