/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 *		IPv4 specific functions
 *
 *		code split from:
 *		linux/ipv4/tcp.c
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *		See tcp.c for author information
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

/*
 * Changes:
 *	David S. Miller		:	New socket lookup architecture.
 *					This code is dedicated to John Dyson.
 *	David S. Miller		:	Change semantics of established hash,
 *					half is devoted to TIME_WAIT sockets
 *					and the rest go in the other half.
 *	Andi Kleen		:	Add support for syncookies and fixed
 *					some bugs: ip options weren't passed to
 *					the TCP layer, missed a check for an
 *					ACK bit.
 *	Andi Kleen		:	Implemented fast path mtu discovery.
 *					Fixed many serious bugs in the
 *					request_sock handling and moved
 *					most of it into the af independent code.
 *					Added tail drop and some other bugfixes.
 *					Added new listen semantics.
 *	Mike McLagan		:	Routing by source
 *	Juan Jose Ciarlante	:	ip_dynaddr bits
 *	Andi Kleen		:	various fixes.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year
 *					coma.
 *	Andi Kleen		:	Fix new listen.
 *	Andi Kleen		:	Fix accept error reporting.
 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
 *	Alexey Kuznetsov		allows both IPv4 and IPv6 sockets to bind
 *					a single port at the same time.
 */
#define pr_fmt(fmt) "TCP: " fmt

#include <linux/bottom_half.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <linux/locallock.h>

#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/secure_seq.h>
#include <net/busy_poll.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

#include <crypto/hash.h>
#include <linux/scatterlist.h>

int sysctl_tcp_tw_reuse __read_mostly;
int sysctl_tcp_low_latency __read_mostly;

#ifdef CONFIG_TCP_MD5SIG
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
#endif

struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);
static __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
{
	return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
					  ip_hdr(skb)->saddr,
					  tcp_hdr(skb)->dest,
					  tcp_hdr(skb)->source);
}
int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
	struct tcp_sock *tp = tcp_sk(sk);

	/* With PAWS, it is safe from the viewpoint
	   of data integrity. Even without PAWS it is safe provided sequence
	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.

	   Actually, the idea is close to VJ's one, only the timestamp cache is
	   held not per host, but per port pair, and the TW bucket is used as
	   state holder.

	   If the TW bucket has already been destroyed we fall back to VJ's
	   scheme and use the initial timestamp retrieved from the peer table.
	 */
	if (tcptw->tw_ts_recent_stamp &&
	    (!twp || (sysctl_tcp_tw_reuse &&
		      get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
		if (tp->write_seq == 0)
			tp->write_seq = 1;
		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
		sock_hold(sktw);
		return 1;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(tcp_twsk_unique);
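
/* Illustrative sketch (not in the original source): the reuse test above
 * boils down to "the TIME-WAIT socket recorded a peer timestamp, and either
 * the caller is not racing a bind (twp == NULL) or tcp_tw_reuse is enabled
 * and at least one second has passed".  A hypothetical condensed form:
 *
 *	bool can_reuse = tcptw->tw_ts_recent_stamp &&
 *			 (!twp || (sysctl_tcp_tw_reuse &&
 *				   get_seconds() - tcptw->tw_ts_recent_stamp > 1));
 *
 * The new write_seq starts 65535 + 2 bytes past tw_snd_nxt so that even in
 * the worst case the new connection's sequence space cannot overlap
 * segments still in flight from the old incarnation.
 */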
/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	__be16 orig_sport, orig_dport;
	__be32 daddr, nexthop;
	struct flowi4 *fl4;
	struct rtable *rt;
	int err;
	struct ip_options_rcu *inet_opt;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	inet_opt = rcu_dereference_protected(inet->inet_opt,
					     lockdep_sock_is_held(sk));
	if (inet_opt && inet_opt->opt.srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet_opt->opt.faddr;
	}

	orig_sport = inet->inet_sport;
	orig_dport = usin->sin_port;
	fl4 = &inet->cork.fl.u.ip4;
	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
			      IPPROTO_TCP,
			      orig_sport, orig_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		if (err == -ENETUNREACH)
			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
		return err;
	}

	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet_opt || !inet_opt->opt.srr)
		daddr = fl4->daddr;

	if (!inet->inet_saddr)
		inet->inet_saddr = fl4->saddr;
	sk_rcv_saddr_set(sk, inet->inet_saddr);

	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent	   = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		if (likely(!tp->repair))
			tp->write_seq	   = 0;
	}

	if (tcp_death_row.sysctl_tw_recycle &&
	    !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
		tcp_fetch_timewait_stamp(sk, &rt->dst);

	inet->inet_dport = usin->sin_port;
	sk_daddr_set(sk, daddr);

	inet_csk(sk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;

	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set the state to SYN-SENT and, without releasing the
	 * socket lock, select a source port, enter ourselves into the hash
	 * tables and complete the initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(&tcp_death_row, sk);
	if (err)
		goto failure;

	sk_set_txhash(sk);

	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
			       inet->inet_sport, inet->inet_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		rt = NULL;
		goto failure;
	}
	/* OK, now commit destination to socket.  */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->dst);

	if (!tp->write_seq && likely(!tp->repair))
		tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
							   inet->inet_daddr,
							   inet->inet_sport,
							   usin->sin_port);

	inet->inet_id = tp->write_seq ^ jiffies;

	err = tcp_connect(sk);

	rt = NULL;
	if (err)
		goto failure;

	return 0;

failure:
	/*
	 * This unhashes the socket and releases the local port,
	 * if necessary.
	 */
	tcp_set_state(sk, TCP_CLOSE);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->inet_dport = 0;
	return err;
}
EXPORT_SYMBOL(tcp_v4_connect);
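
/* Illustrative sketch (not in the original source): the initial sequence
 * number chosen above is a keyed function of the connection 4-tuple plus a
 * clock component, conceptually:
 *
 *	isn = secure_tcp_sequence_number(saddr, daddr, sport, dport);
 *
 * so unrelated connections get unrelated sequence spaces, while repeated
 * connections between the same endpoints still advance with time - which
 * is part of what makes the TIME-WAIT reuse in tcp_twsk_unique() safe.
 */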
/*
 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 * It can be called through tcp_release_cb() if socket was owned by user
 * at the time tcp_v4_err() was called to handle ICMP message.
 */
void tcp_v4_mtu_reduced(struct sock *sk)
{
	struct dst_entry *dst;
	struct inet_sock *inet = inet_sk(sk);
	u32 mtu = tcp_sk(sk)->mtu_info;

	dst = inet_csk_update_pmtu(sk, mtu);
	if (!dst)
		return;

	/* Something is about to be wrong... Remember the soft error
	 * for the case this connection will not be able to recover.
	 */
	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
		sk->sk_err_soft = EMSGSIZE;

	mtu = dst_mtu(dst);

	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    ip_sk_accept_pmtu(sk) &&
	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
		tcp_sync_mss(sk, mtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}
EXPORT_SYMBOL(tcp_v4_mtu_reduced);
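
/* Illustrative sketch (not in the original source): if an ICMP
 * "fragmentation needed" message reports a path MTU of, say, 1400 bytes
 * (a made-up figure), tcp_sync_mss() lowers the cached PMTU and thus the
 * effective MSS.  With plain 20-byte IPv4 and 20-byte TCP headers and no
 * options that works out to
 *
 *	mss = 1400 - 20 - 20 = 1360
 *
 * and tcp_simple_retransmit() resends the queued data at the new size
 * instead of waiting for the retransmit timer.
 */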
static void do_redirect(struct sk_buff *skb, struct sock *sk)
{
	struct dst_entry *dst = __sk_dst_check(sk, 0);

	if (dst)
		dst->ops->redirect(dst, sk, skb);
}

/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
void tcp_req_err(struct sock *sk, u32 seq, bool abort)
{
	struct request_sock *req = inet_reqsk(sk);
	struct net *net = sock_net(sk);

	/* ICMPs are not backlogged, hence we cannot get
	 * an established socket here.
	 */
	if (seq != tcp_rsk(req)->snt_isn) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
	} else if (abort) {
		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
		tcp_listendrop(req->rsk_listener);
	}
	reqsk_put(req);
}
EXPORT_SYMBOL(tcp_req_err);
/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 */
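
/* Illustrative sketch (not in the original source): with the err > 0
 * encoding described above, ICMP_DEST_UNREACH (type 3) with
 * ICMP_PORT_UNREACH (code 3) is delivered as (3 << 8) | 3 == 0x0303, and
 * a consumer would unpack it as:
 *
 *	int icmp_type = err >> 8;
 *	int icmp_code = err & 0xff;
 */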
void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
{
	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
	struct inet_connection_sock *icsk;
	struct tcp_sock *tp;
	struct inet_sock *inet;
	const int type = icmp_hdr(icmp_skb)->type;
	const int code = icmp_hdr(icmp_skb)->code;
	struct sock *sk;
	struct sk_buff *skb;
	struct request_sock *fastopen;
	__u32 seq, snd_una;
	__u32 remaining;
	int err;
	struct net *net = dev_net(icmp_skb->dev);

	sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
				       th->dest, iph->saddr, ntohs(th->source),
				       inet_iif(icmp_skb));
	if (!sk) {
		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
		return;
	}
	if (sk->sk_state == TCP_TIME_WAIT) {
		inet_twsk_put(inet_twsk(sk));
		return;
	}
	seq = ntohl(th->seq);
	if (sk->sk_state == TCP_NEW_SYN_RECV)
		return tcp_req_err(sk, seq,
				   type == ICMP_PARAMETERPROB ||
				   type == ICMP_TIME_EXCEEDED ||
				   (type == ICMP_DEST_UNREACH &&
				    (code == ICMP_NET_UNREACH ||
				     code == ICMP_HOST_UNREACH)));

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 * We do take care of PMTU discovery (RFC1191) special case :
	 * we can receive locally generated ICMP messages while socket is held.
	 */
	if (sock_owned_by_user(sk)) {
		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
	}
	if (sk->sk_state == TCP_CLOSE)
		goto out;

	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
		goto out;
	}

	icsk = inet_csk(sk);
	tp = tcp_sk(sk);
	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
	fastopen = tp->fastopen_rsk;
	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, snd_una, tp->snd_nxt)) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	switch (type) {
	case ICMP_REDIRECT:
		do_redirect(icmp_skb, sk);
		goto out;
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			/* We are not interested in TCP_LISTEN and open_requests
			 * (SYN-ACKs sent out by Linux are always <576bytes so
			 * they should go through unfragmented).
			 */
			if (sk->sk_state == TCP_LISTEN)
				goto out;

			tp->mtu_info = info;
			if (!sock_owned_by_user(sk)) {
				tcp_v4_mtu_reduced(sk);
			} else {
				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
					sock_hold(sk);
			}
			goto out;
		}

		err = icmp_err_convert[code].errno;
		/* check if icmp_skb allows revert of backoff
		 * (see draft-zimmermann-tcp-lcd) */
		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
			break;
		if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
		    !icsk->icsk_backoff || fastopen)
			break;

		if (sock_owned_by_user(sk))
			break;

		icsk->icsk_backoff--;
		icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
					       TCP_TIMEOUT_INIT;
		icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);

		skb = tcp_write_queue_head(sk);
		BUG_ON(!skb);

		remaining = icsk->icsk_rto -
			    min(icsk->icsk_rto,
				tcp_time_stamp - tcp_skb_timestamp(skb));

		if (remaining) {
			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
						  remaining, TCP_RTO_MAX);
		} else {
			/* RTO revert clocked out retransmission.
			 * Will retransmit now */
			tcp_retransmit_timer(sk);
		}

		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->sk_state) {
	case TCP_SYN_SENT:
	case TCP_SYN_RECV:
		/* Only in fast or simultaneous open. If a fast open socket is
		 * already accepted it is treated as a connected one below.
		 */
		if (fastopen && !fastopen->sk)
			break;

		if (!sock_owned_by_user(sk)) {
			sk->sk_err = err;

			sk->sk_error_report(sk);

			tcp_done(sk);
		} else {
			sk->sk_err_soft = err;
		}
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows to consider as hard errors
	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note, that in the modern internet, where routing is unreliable
	 * and in each dark corner broken firewalls sit, sending random
	 * errors ordered by their masters, even these two messages finally
	 * lose their original sense (even Linux sends invalid PORT_UNREACHs)
	 *
	 * Now we are in compliance with RFCs.
	 */

	inet = inet_sk(sk);
	if (!sock_owned_by_user(sk) && inet->recverr) {
		sk->sk_err = err;
		sk->sk_error_report(sk);
	} else	{ /* Only an error on timeout */
		sk->sk_err_soft = err;
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
}
void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
{
	struct tcphdr *th = tcp_hdr(skb);

	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
		skb->csum_start = skb_transport_header(skb) - skb->head;
		skb->csum_offset = offsetof(struct tcphdr, check);
	} else {
		th->check = tcp_v4_check(skb->len, saddr, daddr,
					 csum_partial(th,
						      th->doff << 2,
						      skb->csum));
	}
}

/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
{
	const struct inet_sock *inet = inet_sk(sk);

	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
}
EXPORT_SYMBOL(tcp_v4_send_check);
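
/* Illustrative sketch (not in the original source): in the non-offloaded
 * branch above, skb->csum already holds the one's complement sum of the
 * payload, so the final checksum is assembled as
 *
 *	th->check = tcp_v4_check(skb->len, saddr, daddr,
 *				 csum_partial(th, th->doff << 2, skb->csum));
 *
 * i.e. pseudo-header (saddr, daddr, IPPROTO_TCP, length) + TCP header +
 * payload, folded to 16 bits.  In the CHECKSUM_PARTIAL branch only the
 * pseudo-header sum is stored, and csum_start/csum_offset tell the device
 * (or skb_checksum_help()) where to finish the job.
 */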
static DEFINE_LOCAL_IRQ_LOCK(tcp_sk_lock);

/*
 *	This routine will send an RST to the other tcp.
 *
 *	Someone asks: why don't I ever use socket parameters (TOS, TTL etc.)
 *		      for the reset?
 *	Answer: if a packet caused the RST, it is not for a socket
 *		existing in our system; if it is matched to a socket,
 *		it is just a duplicate segment or a bug in the other
 *		side's TCP.  So we build the reply based only on the
 *		parameters that arrived with the segment.
 *	Exception: precedence violation. We do not implement it in any case.
 */
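
/* Illustrative sketch (not in the original source): the seq/ack choice a
 * few lines below follows RFC 793.  For an incoming segment with the ACK
 * bit set, the reset is sent with SEG.SEQ = incoming SEG.ACK and no ACK
 * bit; otherwise it carries SEG.SEQ = 0 and
 *
 *	SEG.ACK = incoming SEG.SEQ + SYN + FIN + payload length
 *
 * so the peer can match the reset against what it transmitted.
 */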
static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
#ifdef CONFIG_TCP_MD5SIG
		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
#endif
	} rep;
	struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key = NULL;
	const __u8 *hash_location = NULL;
	unsigned char newhash[16];
	int genhash;
	struct sock *sk1 = NULL;
#endif
	struct net *net;

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	/* If sk not NULL, it means we did a successful lookup and the incoming
	 * route had to be correct. prequeue might have dropped our dst.
	 */
	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rep, 0, sizeof(rep));
	rep.th.dest   = th->source;
	rep.th.source = th->dest;
	rep.th.doff   = sizeof(struct tcphdr) / 4;
	rep.th.rst    = 1;

	if (th->ack) {
		rep.th.seq = th->ack_seq;
	} else {
		rep.th.ack = 1;
		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				       skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof(arg));
	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);

	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
#ifdef CONFIG_TCP_MD5SIG
	rcu_read_lock();
	hash_location = tcp_parse_md5sig_option(th);
	if (sk && sk_fullsock(sk)) {
		key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
					&ip_hdr(skb)->saddr, AF_INET);
	} else if (hash_location) {
		/*
		 * The active side is lost. Try to find the listening socket
		 * through the source port, and then find the md5 key through
		 * the listening socket.  We do not weaken security here:
		 * the incoming packet is checked against the md5 hash of the
		 * key we find; no RST is generated if the hash doesn't match.
		 */
		sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
					     ip_hdr(skb)->saddr,
					     th->source, ip_hdr(skb)->daddr,
					     ntohs(th->source), inet_iif(skb));
		/* don't send rst if it can't find key */
		if (!sk1)
			goto out;

		key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
					&ip_hdr(skb)->saddr, AF_INET);
		if (!key)
			goto out;

		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
		if (genhash || memcmp(hash_location, newhash, 16) != 0)
			goto out;
	}

	if (key) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
				   (TCPOPT_NOP << 16) |
				   (TCPOPT_MD5SIG << 8) |
				   TCPOLEN_MD5SIG);
		/* Update length and the length the header thinks exists */
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;

	/* When the socket is gone, all binding information is lost.
	 * Routing might fail in this case. There is no choice here: if we
	 * force the input interface, we will misroute in case of an
	 * asymmetric route.
	 */
	if (sk)
		arg.bound_dev_if = sk->sk_bound_dev_if;

	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));

	arg.tos = ip_hdr(skb)->tos;

	local_lock(tcp_sk_lock);
	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len);

	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
	local_unlock(tcp_sk_lock);

#ifdef CONFIG_TCP_MD5SIG
out:
	rcu_read_unlock();
#endif
}
/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
   outside socket context is ugly, certainly. What can I do?
 */

static void tcp_v4_send_ack(struct net *net,
			    struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 tsval, u32 tsecr, int oif,
			    struct tcp_md5sig_key *key,
			    int reply_flags, u8 tos)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
#ifdef CONFIG_TCP_MD5SIG
			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
#endif
			];
	} rep;
	struct ip_reply_arg arg;

	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof(arg));

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);
	if (tsecr) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				   (TCPOPT_TIMESTAMP << 8) |
				   TCPOLEN_TIMESTAMP);
		rep.opt[1] = htonl(tsval);
		rep.opt[2] = htonl(tsecr);
		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
	}

	/* Swap the send and the receive. */
	rep.th.dest    = th->source;
	rep.th.source  = th->dest;
	rep.th.doff    = arg.iov[0].iov_len / 4;
	rep.th.seq     = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack     = 1;
	rep.th.window  = htons(win);

#ifdef CONFIG_TCP_MD5SIG
	if (key) {
		int offset = (tsecr) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
					  (TCPOPT_NOP << 16) |
					  (TCPOPT_MD5SIG << 8) |
					  TCPOLEN_MD5SIG);
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len/4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.flags = reply_flags;
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	if (oif)
		arg.bound_dev_if = oif;
	arg.tos = tos;
	local_lock(tcp_sk_lock);
	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len);

	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	local_unlock(tcp_sk_lock);
}

static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
	struct inet_timewait_sock *tw = inet_twsk(sk);
	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);

	tcp_v4_send_ack(sock_net(sk), skb,
			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
			tcp_time_stamp + tcptw->tw_ts_offset,
			tcptw->tw_ts_recent,
			tw->tw_bound_dev_if,
			tcp_twsk_md5_key(tcptw),
			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
			tw->tw_tos
			);

	inet_twsk_put(tw);
}

static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req)
{
	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
	 */
	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
					     tcp_sk(sk)->snd_nxt;

	/* RFC 7323 2.3
	 * The window field (SEG.WND) of every outgoing segment, with the
	 * exception of <SYN> segments, MUST be right-shifted by
	 * Rcv.Wind.Shift bits:
	 */
	tcp_v4_send_ack(sock_net(sk), skb, seq,
			tcp_rsk(req)->rcv_nxt,
			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
			tcp_time_stamp,
			req->ts_recent,
			0,
			tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
					  AF_INET),
			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
			ip_hdr(skb)->tos);
}
/*
 *	Send a SYN-ACK after having received a SYN.
 *	This still operates on a request_sock only, not on a big
 *	socket.
 */
static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
			      struct flowi *fl,
			      struct request_sock *req,
			      struct tcp_fastopen_cookie *foc,
			      enum tcp_synack_type synack_type)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	struct flowi4 fl4;
	int err = -1;
	struct sk_buff *skb;

	/* First, grab a route. */
	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
		return -1;

	skb = tcp_make_synack(sk, dst, req, foc, synack_type);

	if (skb) {
		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);

		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
					    ireq->ir_rmt_addr,
					    ireq->opt);
		err = net_xmit_eval(err);
	}

	return err;
}

/*
 *	IPv4 request_sock destructor.
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
	kfree(inet_rsk(req)->opt);
}
#ifdef CONFIG_TCP_MD5SIG
/*
 * RFC2385 MD5 checksumming requires a mapping of
 * IP address->MD5 Key.
 * We need to maintain these in the sk structure.
 */

/* Find the Key structure for an address.  */
struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
					 const union tcp_md5_addr *addr,
					 int family)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	unsigned int size = sizeof(struct in_addr);
	const struct tcp_md5sig_info *md5sig;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       lockdep_sock_is_held(sk));
	if (!md5sig)
		return NULL;
#if IS_ENABLED(CONFIG_IPV6)
	if (family == AF_INET6)
		size = sizeof(struct in6_addr);
#endif
	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
		if (key->family != family)
			continue;
		if (!memcmp(&key->addr, addr, size))
			return key;
	}
	return NULL;
}
EXPORT_SYMBOL(tcp_md5_do_lookup);
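
/* Illustrative sketch (not in the original source): a typical lookup for
 * the peer of an established IPv4 socket, done under rcu_read_lock() or
 * with the socket lock held, as the comment above requires:
 *
 *	struct tcp_md5sig_key *key;
 *
 *	key = tcp_md5_do_lookup(sk,
 *				(union tcp_md5_addr *)&inet_sk(sk)->inet_daddr,
 *				AF_INET);
 *
 * A non-NULL result means segments exchanged with that peer are expected
 * to carry a valid RFC 2385 MD5 option.
 */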
struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
					 const struct sock *addr_sk)
{
	const union tcp_md5_addr *addr;

	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
	return tcp_md5_do_lookup(sk, addr, AF_INET);
}
EXPORT_SYMBOL(tcp_v4_md5_lookup);

/* This can be called on a newly created socket, from other files */
int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
		   int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
{
	/* Add Key to the list */
	struct tcp_md5sig_key *key;
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_info *md5sig;

	key = tcp_md5_do_lookup(sk, addr, family);
	if (key) {
		/* Pre-existing entry - just update that one. */
		memcpy(key->key, newkey, newkeylen);
		key->keylen = newkeylen;
		return 0;
	}

	md5sig = rcu_dereference_protected(tp->md5sig_info,
					   lockdep_sock_is_held(sk));
	if (!md5sig) {
		md5sig = kmalloc(sizeof(*md5sig), gfp);
		if (!md5sig)
			return -ENOMEM;

		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
		INIT_HLIST_HEAD(&md5sig->head);
		rcu_assign_pointer(tp->md5sig_info, md5sig);
	}

	key = sock_kmalloc(sk, sizeof(*key), gfp);
	if (!key)
		return -ENOMEM;
	if (!tcp_alloc_md5sig_pool()) {
		sock_kfree_s(sk, key, sizeof(*key));
		return -ENOMEM;
	}

	memcpy(key->key, newkey, newkeylen);
	key->keylen = newkeylen;
	key->family = family;
	memcpy(&key->addr, addr,
	       (family == AF_INET6) ? sizeof(struct in6_addr) :
				      sizeof(struct in_addr));
	hlist_add_head_rcu(&key->node, &md5sig->head);
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_add);

int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
{
	struct tcp_md5sig_key *key;

	key = tcp_md5_do_lookup(sk, addr, family);
	if (!key)
		return -ENOENT;
	hlist_del_rcu(&key->node);
	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
	kfree_rcu(key, rcu);
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_del);

static void tcp_clear_md5_list(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	struct hlist_node *n;
	struct tcp_md5sig_info *md5sig;

	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);

	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
		hlist_del_rcu(&key->node);
		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
		kfree_rcu(key, rcu);
	}
}
static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
				 int optlen)
{
	struct tcp_md5sig cmd;
	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;

	if (optlen < sizeof(cmd))
		return -EINVAL;

	if (copy_from_user(&cmd, optval, sizeof(cmd)))
		return -EFAULT;

	if (sin->sin_family != AF_INET)
		return -EINVAL;

	if (!cmd.tcpm_keylen)
		return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
				      AF_INET);

	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
		return -EINVAL;

	return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
			      AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
			      GFP_KERNEL);
}
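
/* Illustrative sketch (not in the original source): the userspace side of
 * the parser above.  A process installs a key for one peer with the
 * TCP_MD5SIG socket option before connect()/listen(); the address and key
 * bytes below are made-up placeholders:
 *
 *	struct tcp_md5sig md5 = { 0 };
 *	struct sockaddr_in *peer = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	peer->sin_family = AF_INET;
 *	inet_pton(AF_INET, "192.0.2.1", &peer->sin_addr);
 *	md5.tcpm_keylen = 6;
 *	memcpy(md5.tcpm_key, "secret", 6);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 *
 * Passing tcpm_keylen == 0 deletes the key for that address, matching the
 * tcp_md5_do_del() branch above.
 */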
static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
				   __be32 daddr, __be32 saddr,
				   const struct tcphdr *th, int nbytes)
{
	struct tcp4_pseudohdr *bp;
	struct scatterlist sg;
	struct tcphdr *_th;

	bp = hp->scratch;
	bp->saddr = saddr;
	bp->daddr = daddr;
	bp->pad = 0;
	bp->protocol = IPPROTO_TCP;
	bp->len = cpu_to_be16(nbytes);

	_th = (struct tcphdr *)(bp + 1);
	memcpy(_th, th, sizeof(*th));
	_th->check = 0;

	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
	ahash_request_set_crypt(hp->md5_req, &sg, NULL,
				sizeof(*bp) + sizeof(*th));
	return crypto_ahash_update(hp->md5_req);
}

static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
{
	struct tcp_md5sig_pool *hp;
	struct ahash_request *req;

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	req = hp->md5_req;

	if (crypto_ahash_init(req))
		goto clear_hash;
	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	ahash_request_set_crypt(req, NULL, md5_hash, 0);
	if (crypto_ahash_final(req))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}

int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
			const struct sock *sk,
			const struct sk_buff *skb)
{
	struct tcp_md5sig_pool *hp;
	struct ahash_request *req;
	const struct tcphdr *th = tcp_hdr(skb);
	__be32 saddr, daddr;

	if (sk) { /* valid for establish/request sockets */
		saddr = sk->sk_rcv_saddr;
		daddr = sk->sk_daddr;
	} else {
		const struct iphdr *iph = ip_hdr(skb);
		saddr = iph->saddr;
		daddr = iph->daddr;
	}

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	req = hp->md5_req;

	if (crypto_ahash_init(req))
		goto clear_hash;

	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
		goto clear_hash;
	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	ahash_request_set_crypt(req, NULL, md5_hash, 0);
	if (crypto_ahash_final(req))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}
EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
#endif

/* Called with rcu_read_lock() */
static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
				    const struct sk_buff *skb)
{
#ifdef CONFIG_TCP_MD5SIG
	/*
	 * This gets called for each TCP segment that arrives
	 * so we want to be efficient.
	 * We have 3 drop cases:
	 * o No MD5 hash and one expected.
	 * o MD5 hash and we're not expecting one.
	 * o MD5 hash and it's wrong.
	 */
	const __u8 *hash_location = NULL;
	struct tcp_md5sig_key *hash_expected;
	const struct iphdr *iph = ip_hdr(skb);
	const struct tcphdr *th = tcp_hdr(skb);
	int genhash;
	unsigned char newhash[16];

	hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
					  AF_INET);
	hash_location = tcp_parse_md5sig_option(th);

	/* We've parsed the options - do we have a hash? */
	if (!hash_expected && !hash_location)
		return false;

	if (hash_expected && !hash_location) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
		return true;
	}

	if (!hash_expected && hash_location) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
		return true;
	}

	/* Okay, so this is hash_expected and hash_location -
	 * so we need to calculate the checksum.
	 */
	genhash = tcp_v4_md5_hash_skb(newhash,
				      hash_expected,
				      NULL, skb);

	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
				     &iph->saddr, ntohs(th->source),
				     &iph->daddr, ntohs(th->dest),
				     genhash ? " tcp_v4_calc_md5_hash failed"
				     : "");
		return true;
	}
	return false;
#endif
	return false;
}
static void tcp_v4_init_req(struct request_sock *req,
			    const struct sock *sk_listener,
			    struct sk_buff *skb)
{
	struct inet_request_sock *ireq = inet_rsk(req);

	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
	ireq->opt = tcp_v4_save_options(skb);
}

static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
					  struct flowi *fl,
					  const struct request_sock *req,
					  bool *strict)
{
	struct dst_entry *dst = inet_csk_route_req(sk, &fl->u.ip4, req);

	if (strict) {
		if (fl->u.ip4.daddr == inet_rsk(req)->ir_rmt_addr)
			*strict = true;
		else
			*strict = false;
	}

	return dst;
}

struct request_sock_ops tcp_request_sock_ops __read_mostly = {
	.family		=	PF_INET,
	.obj_size	=	sizeof(struct tcp_request_sock),
	.rtx_syn_ack	=	tcp_rtx_synack,
	.send_ack	=	tcp_v4_reqsk_send_ack,
	.destructor	=	tcp_v4_reqsk_destructor,
	.send_reset	=	tcp_v4_send_reset,
	.syn_ack_timeout =	tcp_syn_ack_timeout,
};

static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
	.mss_clamp	=	TCP_MSS_DEFAULT,
#ifdef CONFIG_TCP_MD5SIG
	.req_md5_lookup	=	tcp_v4_md5_lookup,
	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
#endif
	.init_req	=	tcp_v4_init_req,
#ifdef CONFIG_SYN_COOKIES
	.cookie_init_seq =	cookie_v4_init_sequence,
#endif
	.route_req	=	tcp_v4_route_req,
	.init_seq	=	tcp_v4_init_sequence,
	.send_synack	=	tcp_v4_send_synack,
};

int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	/* Never answer SYNs sent to broadcast or multicast addresses */
	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	return tcp_conn_request(&tcp_request_sock_ops,
				&tcp_request_sock_ipv4_ops, sk, skb);

drop:
	tcp_listendrop(sk);
	return 0;
}
EXPORT_SYMBOL(tcp_v4_conn_request);
/*
 * The three way handshake has completed - we got a valid synack -
 * now create the new socket.
 */
struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct dst_entry *dst,
				  struct request_sock *req_unhash,
				  bool *own_req)
{
	struct inet_request_sock *ireq;
	struct inet_sock *newinet;
	struct tcp_sock *newtp;
	struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key;
#endif
	struct ip_options_rcu *inet_opt;

	if (sk_acceptq_is_full(sk))
		goto exit_overflow;

	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit_nonewsk;

	newsk->sk_gso_type = SKB_GSO_TCPV4;
	inet_sk_rx_dst_set(newsk, skb);

	newtp		      = tcp_sk(newsk);
	newinet		      = inet_sk(newsk);
	ireq		      = inet_rsk(req);
	sk_daddr_set(newsk, ireq->ir_rmt_addr);
	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
	newsk->sk_bound_dev_if = ireq->ir_iif;
	newinet->inet_saddr   = ireq->ir_loc_addr;
	inet_opt	      = ireq->opt;
	rcu_assign_pointer(newinet->inet_opt, inet_opt);
	ireq->opt	      = NULL;
	newinet->mc_index     = inet_iif(skb);
	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
	newinet->rcv_tos      = ip_hdr(skb)->tos;
	inet_csk(newsk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
	newinet->inet_id = newtp->write_seq ^ jiffies;

	if (!dst) {
		dst = inet_csk_route_child_sock(sk, newsk, req);
		if (!dst)
			goto put_and_exit;
	} else {
		/* syncookie case : see end of cookie_v4_check() */
	}
	sk_setup_caps(newsk, dst);

	tcp_ca_openreq_child(newsk, dst);

	tcp_sync_mss(newsk, dst_mtu(dst));
	newtp->advmss = dst_metric_advmss(dst);
	if (tcp_sk(sk)->rx_opt.user_mss &&
	    tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
		newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;

	tcp_initialize_rcv_mss(newsk);

#ifdef CONFIG_TCP_MD5SIG
	/* Copy over the MD5 key from the original socket */
	key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
				AF_INET);
	if (key) {
		/*
		 * We're using one, so create a matching key
		 * on the newsk structure. If we fail to get
		 * memory, then we end up not copying the key
		 * across. Shucks.
		 */
		tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
			       AF_INET, key->key, key->keylen, GFP_ATOMIC);
		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
	}
#endif

	if (__inet_inherit_port(sk, newsk) < 0)
		goto put_and_exit;
	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
	if (*own_req)
		tcp_move_syn(newtp, req);

	return newsk;

exit_overflow:
	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
exit_nonewsk:
	dst_release(dst);
exit:
	tcp_listendrop(sk);
	return NULL;
put_and_exit:
	inet_csk_prepare_forced_close(newsk);
	tcp_done(newsk);
	goto exit;
}
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
{
#ifdef CONFIG_SYN_COOKIES
	const struct tcphdr *th = tcp_hdr(skb);

	if (!th->syn)
		sk = cookie_v4_check(sk, skb);
#endif
	return sk;
}

/* The socket must have its spinlock held when we get
 * here, unless it is a TCP_LISTEN socket.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	struct sock *rsk;

	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
		struct dst_entry *dst = sk->sk_rx_dst;

		sock_rps_save_rxhash(sk, skb);
		sk_mark_napi_id(sk, skb);
		if (dst) {
			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
			    !dst->ops->check(dst, 0)) {
				dst_release(dst);
				sk->sk_rx_dst = NULL;
			}
		}
		tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len);
		return 0;
	}

	if (tcp_checksum_complete(skb))
		goto csum_err;

	if (sk->sk_state == TCP_LISTEN) {
		struct sock *nsk = tcp_v4_cookie_check(sk, skb);

		if (!nsk)
			goto discard;
		if (nsk != sk) {
			sock_rps_save_rxhash(nsk, skb);
			sk_mark_napi_id(nsk, skb);
			if (tcp_child_process(sk, nsk, skb)) {
				rsk = nsk;
				goto reset;
			}
			return 0;
		}
	} else
		sock_rps_save_rxhash(sk, skb);

	if (tcp_rcv_state_process(sk, skb)) {
		rsk = sk;
		goto reset;
	}
	return 0;

reset:
	tcp_v4_send_reset(rsk, skb);
discard:
	kfree_skb(skb);
	/* Be careful here. If this function gets more complicated and
	 * gcc suffers from register pressure on the x86, sk (in %ebx)
	 * might be destroyed here. This current version compiles correctly,
	 * but you have been warned.
	 */
	return 0;

csum_err:
	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
	goto discard;
}
EXPORT_SYMBOL(tcp_v4_do_rcv);
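
/* Illustrative sketch (not in the original source): the caller side of the
 * locking contract described above, roughly what tcp_v4_rcv() does further
 * below in softirq context:
 *
 *	bh_lock_sock_nested(sk);
 *	if (!sock_owned_by_user(sk))
 *		ret = tcp_v4_do_rcv(sk, skb);
 *	else if (tcp_add_backlog(sk, skb))
 *		goto drop;
 *	bh_unlock_sock(sk);
 *
 * Segments that arrive while a process owns the socket are parked on the
 * backlog and replayed through backlog_rcv (== tcp_v4_do_rcv) when the
 * owner releases the lock.
 */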
void tcp_v4_early_demux(struct sk_buff *skb)
{
	const struct iphdr *iph;
	const struct tcphdr *th;
	struct sock *sk;

	if (skb->pkt_type != PACKET_HOST)
		return;

	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
		return;

	iph = ip_hdr(skb);
	th = tcp_hdr(skb);

	if (th->doff < sizeof(struct tcphdr) / 4)
		return;

	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
				       iph->saddr, th->source,
				       iph->daddr, ntohs(th->dest),
				       skb->skb_iif);
	if (sk) {
		skb->sk = sk;
		skb->destructor = sock_edemux;
		if (sk_fullsock(sk)) {
			struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);

			if (dst)
				dst = dst_check(dst, 0);
			if (dst &&
			    inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
				skb_dst_set_noref(skb, dst);
		}
	}
}

/* Packet is added to VJ-style prequeue for processing in process
 * context, if a reader task is waiting. Apparently, this exciting
 * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93)
 * failed somewhere. Latency? Burstiness? Well, at least now we will
 * see, why it failed. 8)8)				  --ANK
 */
bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (sysctl_tcp_low_latency || !tp->ucopy.task)
		return false;

	if (skb->len <= tcp_hdrlen(skb) &&
	    skb_queue_len(&tp->ucopy.prequeue) == 0)
		return false;

	/* Before escaping RCU protected region, we need to take care of skb
	 * dst. Prequeue is only enabled for established sockets.
	 * For such sockets, we might need the skb dst only to set sk->sk_rx_dst.
	 * Instead of doing a full sk_rx_dst validity check here, let's perform
	 * an optimistic one.
	 */
	if (likely(sk->sk_rx_dst))
		skb_dst_drop(skb);
	else
		skb_dst_force_safe(skb);

	__skb_queue_tail(&tp->ucopy.prequeue, skb);
	tp->ucopy.memory += skb->truesize;
	if (skb_queue_len(&tp->ucopy.prequeue) >= 32 ||
	    tp->ucopy.memory + atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) {
		struct sk_buff *skb1;

		BUG_ON(sock_owned_by_user(sk));
		__NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPPREQUEUEDROPPED,
				skb_queue_len(&tp->ucopy.prequeue));

		while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
			sk_backlog_rcv(sk, skb1);

		tp->ucopy.memory = 0;
	} else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
		wake_up_interruptible_sync_poll(sk_sleep(sk),
					   POLLIN | POLLRDNORM | POLLRDBAND);
		if (!inet_csk_ack_scheduled(sk))
			inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
						  (3 * tcp_rto_min(sk)) / 4,
						  TCP_RTO_MAX);
	}
	return true;
}
EXPORT_SYMBOL(tcp_prequeue);
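
/* Illustrative sketch (not in the original source): the flush condition
 * above fires either on queue length or on memory.  A burst of 32 small
 * segments, or enough accumulated truesize to push rmem past sk_rcvbuf,
 * drains the prequeue inline through sk_backlog_rcv(); below those
 * thresholds the first queued segment merely wakes the sleeping reader,
 * which then processes the prequeue in its own (process) context.
 */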
bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
{
	u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;

	/* Only the socket owner can try to collapse/prune rx queues
	 * to reduce memory overhead, so add a little headroom here.
	 * Only a few sockets' backlogs are likely to be non-empty
	 * at the same time.
	 */
	limit += 64*1024;

	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
	 * we can fix skb->truesize to its real value to avoid future drops.
	 * This is valid because skb is not yet charged to the socket.
	 * It has been noticed pure SACK packets were sometimes dropped
	 * (if cooked by drivers without copybreak feature).
	 */
	if (!skb->data_len)
		skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));

	if (unlikely(sk_add_backlog(sk, skb, limit))) {
		bh_unlock_sock(sk);
		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
		return true;
	}
	return false;
}
EXPORT_SYMBOL(tcp_add_backlog);
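
/* Illustrative sketch (not in the original source): with made-up buffer
 * sizes of sk_rcvbuf = 256K and sk_sndbuf = 256K, the backlog limit above
 * works out to
 *
 *	limit = 256K + 256K + 64K = 576K bytes of truesize
 *
 * before sk_add_backlog() starts refusing segments, at which point the
 * drop is accounted to LINUX_MIB_TCPBACKLOGDROP.
 */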
int tcp_filter(struct sock *sk, struct sk_buff *skb)
{
	struct tcphdr *th = (struct tcphdr *)skb->data;
	unsigned int eaten = skb->len;
	int err;

	err = sk_filter_trim_cap(sk, skb, th->doff * 4);
	if (!err) {
		eaten -= skb->len;
		TCP_SKB_CB(skb)->end_seq -= eaten;
	}
	return err;
}
EXPORT_SYMBOL(tcp_filter);
/*
 *	From tcp_input.c
 */

int tcp_v4_rcv(struct sk_buff *skb)
{
	struct net *net = dev_net(skb->dev);
	const struct iphdr *iph;
	const struct tcphdr *th;
	bool refcounted;
	struct sock *sk;
	int ret;

	if (skb->pkt_type != PACKET_HOST)
		goto discard_it;

	/* Count it even if it's bad */
	__TCP_INC_STATS(net, TCP_MIB_INSEGS);

	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
		goto discard_it;

	th = (const struct tcphdr *)skb->data;

	if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
		goto bad_packet;
	if (!pskb_may_pull(skb, th->doff * 4))
		goto discard_it;

	/* An explanation is required here, I think.
	 * Packet length and doff are validated by header prediction,
	 * provided case of th->doff==0 is eliminated.
	 * So, we defer the checks. */

	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
		goto csum_error;

	th = (const struct tcphdr *)skb->data;
	iph = ip_hdr(skb);
	/* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
	 * barrier() makes sure compiler wont play fool^Waliasing games.
	 */
	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
		sizeof(struct inet_skb_parm));
	barrier();

	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
				    skb->len - th->doff * 4);
	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
	TCP_SKB_CB(skb)->sacked	 = 0;

lookup:
	sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
			       th->dest, &refcounted);
	if (!sk)
		goto no_tcp_socket;

process:
	if (sk->sk_state == TCP_TIME_WAIT)
		goto do_time_wait;

	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		struct request_sock *req = inet_reqsk(sk);
		struct sock *nsk;

		sk = req->rsk_listener;
		if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
			sk_drops_add(sk, skb);
			reqsk_put(req);
			goto discard_it;
		}
		if (unlikely(sk->sk_state != TCP_LISTEN)) {
			inet_csk_reqsk_queue_drop_and_put(sk, req);
			goto lookup;
		}
		/* We own a reference on the listener, increase it again
		 * as we might lose it too soon.
		 */
		sock_hold(sk);
		refcounted = true;
		nsk = tcp_check_req(sk, skb, req, false);
		if (!nsk) {
			reqsk_put(req);
			goto discard_and_relse;
		}
		if (nsk == sk) {
			reqsk_put(req);
		} else if (tcp_child_process(sk, nsk, skb)) {
			tcp_v4_send_reset(nsk, skb);
			goto discard_and_relse;
		} else {
			sock_put(sk);
			return 0;
		}
	}
	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
		goto discard_and_relse;
	}

	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
		goto discard_and_relse;

	if (tcp_v4_inbound_md5_hash(sk, skb))
		goto discard_and_relse;

	nf_reset(skb);

	if (tcp_filter(sk, skb))
		goto discard_and_relse;
	th = (const struct tcphdr *)skb->data;
	iph = ip_hdr(skb);

	skb->dev = NULL;

	if (sk->sk_state == TCP_LISTEN) {
		ret = tcp_v4_do_rcv(sk, skb);
		goto put_and_return;
	}

	sk_incoming_cpu_update(sk);

	bh_lock_sock_nested(sk);
	tcp_segs_in(tcp_sk(sk), skb);
	ret = 0;
	if (!sock_owned_by_user(sk)) {
		if (!tcp_prequeue(sk, skb))
			ret = tcp_v4_do_rcv(sk, skb);
	} else if (tcp_add_backlog(sk, skb)) {
		goto discard_and_relse;
	}
	bh_unlock_sock(sk);

put_and_return:
	if (refcounted)
		sock_put(sk);

	return ret;

no_tcp_socket:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
		goto discard_it;

	if (tcp_checksum_complete(skb)) {
csum_error:
		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
bad_packet:
		__TCP_INC_STATS(net, TCP_MIB_INERRS);
	} else {
		tcp_v4_send_reset(NULL, skb);
	}

discard_it:
	/* Discard frame. */
	kfree_skb(skb);
	return 0;

discard_and_relse:
	sk_drops_add(sk, skb);
	if (refcounted)
		sock_put(sk);
	goto discard_it;

do_time_wait:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto discard_it;
	}

	if (tcp_checksum_complete(skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto csum_error;
	}
	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
	case TCP_TW_SYN: {
		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
							&tcp_hashinfo, skb,
							__tcp_hdrlen(th),
							iph->saddr, th->source,
							iph->daddr, th->dest,
							inet_iif(skb));
		if (sk2) {
			inet_twsk_deschedule_put(inet_twsk(sk));
			sk = sk2;
			refcounted = false;
			goto process;
		}
		/* Fall through to ACK */
	}
	case TCP_TW_ACK:
		tcp_v4_timewait_ack(sk, skb);
		break;
	case TCP_TW_RST:
		tcp_v4_send_reset(sk, skb);
		inet_twsk_deschedule_put(inet_twsk(sk));
		goto discard_it;
	case TCP_TW_SUCCESS:;
	}
	goto discard_it;
}
static struct timewait_sock_ops tcp_timewait_sock_ops = {
	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
	.twsk_unique	= tcp_twsk_unique,
	.twsk_destructor= tcp_twsk_destructor,
};

void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);

	if (dst && dst_hold_safe(dst)) {
		sk->sk_rx_dst = dst;
		inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
	}
}
EXPORT_SYMBOL(inet_sk_rx_dst_set);

const struct inet_connection_sock_af_ops ipv4_specific = {
	.queue_xmit	   = ip_queue_xmit,
	.send_check	   = tcp_v4_send_check,
	.rebuild_header	   = inet_sk_rebuild_header,
	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
	.conn_request	   = tcp_v4_conn_request,
	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
	.net_header_len	   = sizeof(struct iphdr),
	.setsockopt	   = ip_setsockopt,
	.getsockopt	   = ip_getsockopt,
	.addr2sockaddr	   = inet_csk_addr2sockaddr,
	.sockaddr_len	   = sizeof(struct sockaddr_in),
	.bind_conflict	   = inet_csk_bind_conflict,
#ifdef CONFIG_COMPAT
	.compat_setsockopt = compat_ip_setsockopt,
	.compat_getsockopt = compat_ip_getsockopt,
#endif
	.mtu_reduced	   = tcp_v4_mtu_reduced,
};
EXPORT_SYMBOL(ipv4_specific);

#ifdef CONFIG_TCP_MD5SIG
static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
	.md5_lookup		= tcp_v4_md5_lookup,
	.calc_md5_hash		= tcp_v4_md5_hash_skb,
	.md5_parse		= tcp_v4_parse_md5_keys,
};
#endif

/* NOTE: A lot of things set to zero explicitly by call to
 *       sk_alloc() so need not be done here.
 */
static int tcp_v4_init_sock(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	tcp_init_sock(sk);

	icsk->icsk_af_ops = &ipv4_specific;

#ifdef CONFIG_TCP_MD5SIG
	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
#endif

	return 0;
}
void tcp_v4_destroy_sock(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	tcp_clear_xmit_timers(sk);

	tcp_cleanup_congestion_control(sk);

	/* Clean up the write buffer. */
	tcp_write_queue_purge(sk);

	/* Cleans up our, hopefully empty, out_of_order_queue. */
	skb_rbtree_purge(&tp->out_of_order_queue);

#ifdef CONFIG_TCP_MD5SIG
	/* Clean up the MD5 key list, if any */
	if (tp->md5sig_info) {
		tcp_clear_md5_list(sk);
		kfree_rcu(tp->md5sig_info, rcu);
		tp->md5sig_info = NULL;
	}
#endif

	/* Clean prequeue, it must be empty really */
	__skb_queue_purge(&tp->ucopy.prequeue);

	/* Clean up a referenced TCP bind bucket. */
	if (inet_csk(sk)->icsk_bind_hash)
		inet_put_port(sk);

	BUG_ON(tp->fastopen_rsk);

	/* If socket is aborted during connect operation */
	tcp_free_fastopen_req(tp);
	tcp_saved_syn_free(tp);

	sk_sockets_allocated_dec(sk);
}
EXPORT_SYMBOL(tcp_v4_destroy_sock);
#ifdef CONFIG_PROC_FS
/* Proc filesystem TCP sock list dumping. */

/*
 * Get the next listener socket following cur.  If cur is NULL, get the
 * first socket starting from the bucket given in st->bucket; when
 * st->bucket is zero the very first socket in the hash table is returned.
 */
static void *listening_get_next(struct seq_file *seq, void *cur)
{
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);
	struct inet_listen_hashbucket *ilb;
	struct sock *sk = cur;

	if (!sk) {
get_head:
		ilb = &tcp_hashinfo.listening_hash[st->bucket];
		spin_lock_bh(&ilb->lock);
		sk = sk_head(&ilb->head);
		st->offset = 0;
		goto get_sk;
	}
	ilb = &tcp_hashinfo.listening_hash[st->bucket];
	++st->num;
	++st->offset;

	sk = sk_next(sk);
get_sk:
	sk_for_each_from(sk) {
		if (!net_eq(sock_net(sk), net))
			continue;
		if (sk->sk_family == st->family)
			return sk;
	}
	spin_unlock_bh(&ilb->lock);
	st->offset = 0;
	if (++st->bucket < INET_LHTABLE_SIZE)
		goto get_head;
	return NULL;
}

static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	st->bucket = 0;
	st->offset = 0;
	rc = listening_get_next(seq, NULL);

	while (rc && *pos) {
		rc = listening_get_next(seq, rc);
		--*pos;
	}
	return rc;
}

static inline bool empty_bucket(const struct tcp_iter_state *st)
{
	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
}

/*
 * Get first established socket starting from bucket given in st->bucket.
 * If st->bucket is zero, the very first socket in the hash is returned.
 */
static void *established_get_first(struct seq_file *seq)
{
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);
	void *rc = NULL;

	st->offset = 0;
	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
		struct sock *sk;
		struct hlist_nulls_node *node;
		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);

		/* Lockless fast path for the common case of empty buckets */
		if (empty_bucket(st))
			continue;

		spin_lock_bh(lock);
		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
			if (sk->sk_family != st->family ||
			    !net_eq(sock_net(sk), net)) {
				continue;
			}
			rc = sk;
			goto out;
		}
		spin_unlock_bh(lock);
	}
out:
	return rc;
}

static void *established_get_next(struct seq_file *seq, void *cur)
{
	struct sock *sk = cur;
	struct hlist_nulls_node *node;
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);

	++st->num;
	++st->offset;

	sk = sk_nulls_next(sk);

	sk_nulls_for_each_from(sk, node) {
		if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
			return sk;
	}

	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
	++st->bucket;
	return established_get_first(seq);
}

static void *established_get_idx(struct seq_file *seq, loff_t pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	st->bucket = 0;
	rc = established_get_first(seq);

	while (rc && pos) {
		rc = established_get_next(seq, rc);
		--pos;
	}
	return rc;
}

static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
{
	void *rc;
	struct tcp_iter_state *st = seq->private;

	st->state = TCP_SEQ_STATE_LISTENING;
	rc	  = listening_get_idx(seq, &pos);

	if (!rc) {
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		rc	  = established_get_idx(seq, pos);
	}

	return rc;
}

static void *tcp_seek_last_pos(struct seq_file *seq)
{
	struct tcp_iter_state *st = seq->private;
	int offset = st->offset;
	int orig_num = st->num;
	void *rc = NULL;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		if (st->bucket >= INET_LHTABLE_SIZE)
			break;
		st->state = TCP_SEQ_STATE_LISTENING;
		rc = listening_get_next(seq, NULL);
		while (offset-- && rc)
			rc = listening_get_next(seq, rc);
		if (rc)
			break;
		st->bucket = 0;
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		/* Fallthrough */
	case TCP_SEQ_STATE_ESTABLISHED:
		if (st->bucket > tcp_hashinfo.ehash_mask)
			break;
		rc = established_get_first(seq);
		while (offset-- && rc)
			rc = established_get_next(seq, rc);
	}

	st->num = orig_num;

	return rc;
}
static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	if (*pos && *pos == st->last_pos) {
		rc = tcp_seek_last_pos(seq);
		if (rc)
			goto out;
	}

	st->state = TCP_SEQ_STATE_LISTENING;
	st->num = 0;
	st->bucket = 0;
	st->offset = 0;
	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;

out:
	st->last_pos = *pos;
	return rc;
}

static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc = NULL;

	if (v == SEQ_START_TOKEN) {
		rc = tcp_get_idx(seq, 0);
		goto out;
	}

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		rc = listening_get_next(seq, v);
		if (!rc) {
			st->state = TCP_SEQ_STATE_ESTABLISHED;
			st->bucket = 0;
			st->offset = 0;
			rc	  = established_get_first(seq);
		}
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		rc = established_get_next(seq, v);
		break;
	}
out:
	++*pos;
	st->last_pos = *pos;
	return rc;
}

static void tcp_seq_stop(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		if (v != SEQ_START_TOKEN)
			spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		if (v)
			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
		break;
	}
}

int tcp_seq_open(struct inode *inode, struct file *file)
{
	struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
	struct tcp_iter_state *s;
	int err;

	err = seq_open_net(inode, file, &afinfo->seq_ops,
			   sizeof(struct tcp_iter_state));
	if (err < 0)
		return err;

	s = ((struct seq_file *)file->private_data)->private;
	s->family		= afinfo->family;
	s->last_pos		= 0;
	return 0;
}
EXPORT_SYMBOL(tcp_seq_open);

int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
{
	int rc = 0;
	struct proc_dir_entry *p;

	afinfo->seq_ops.start		= tcp_seq_start;
	afinfo->seq_ops.next		= tcp_seq_next;
	afinfo->seq_ops.stop		= tcp_seq_stop;

	p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
			     afinfo->seq_fops, afinfo);
	if (!p)
		rc = -ENOMEM;
	return rc;
}
EXPORT_SYMBOL(tcp_proc_register);

void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
{
	remove_proc_entry(afinfo->name, net->proc_net);
}
EXPORT_SYMBOL(tcp_proc_unregister);
static void get_openreq4(const struct request_sock *req,
			 struct seq_file *f, int i)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	long delta = req->rsk_timer.expires - jiffies;

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
		i,
		ireq->ir_loc_addr,
		ireq->ir_num,
		ireq->ir_rmt_addr,
		ntohs(ireq->ir_rmt_port),
		TCP_SYN_RECV,
		0, 0, /* could print option size, but that is af dependent. */
		1,    /* timers active (only the expire timer) */
		jiffies_delta_to_clock_t(delta),
		req->num_timeout,
		from_kuid_munged(seq_user_ns(f),
				 sock_i_uid(req->rsk_listener)),
		0,  /* non standard timer */
		0, /* open_requests have no inode */
		0,
		req);
}

static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
{
	int timer_active;
	unsigned long timer_expires;
	const struct tcp_sock *tp = tcp_sk(sk);
	const struct inet_connection_sock *icsk = inet_csk(sk);
	const struct inet_sock *inet = inet_sk(sk);
	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
	__be32 dest = inet->inet_daddr;
	__be32 src = inet->inet_rcv_saddr;
	__u16 destp = ntohs(inet->inet_dport);
	__u16 srcp = ntohs(inet->inet_sport);
	int rx_queue;
	int state;

	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
	    icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
		timer_active	= 1;
		timer_expires	= icsk->icsk_timeout;
	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
		timer_active	= 4;
		timer_expires	= icsk->icsk_timeout;
	} else if (timer_pending(&sk->sk_timer)) {
		timer_active	= 2;
		timer_expires	= sk->sk_timer.expires;
	} else {
		timer_active	= 0;
		timer_expires = jiffies;
	}

	state = sk_state_load(sk);
	if (state == TCP_LISTEN)
		rx_queue = sk->sk_ack_backlog;
	else
		/* Because we don't lock the socket,
		 * we might find a transient negative value.
		 */
		rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
		i, src, srcp, dest, destp, state,
		tp->write_seq - tp->snd_una,
		rx_queue,
		timer_active,
		jiffies_delta_to_clock_t(timer_expires - jiffies),
		icsk->icsk_retransmits,
		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
		icsk->icsk_probes_out,
		sock_i_ino(sk),
		atomic_read(&sk->sk_refcnt), sk,
		jiffies_to_clock_t(icsk->icsk_rto),
		jiffies_to_clock_t(icsk->icsk_ack.ato),
		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
		tp->snd_cwnd,
		state == TCP_LISTEN ?
		    fastopenq->max_qlen :
		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
}

static void get_timewait4_sock(const struct inet_timewait_sock *tw,
			       struct seq_file *f, int i)
{
	long delta = tw->tw_timer.expires - jiffies;
	__be32 dest, src;
	__u16 destp, srcp;

	dest  = tw->tw_daddr;
	src   = tw->tw_rcv_saddr;
	destp = ntohs(tw->tw_dport);
	srcp  = ntohs(tw->tw_sport);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
		atomic_read(&tw->tw_refcnt), tw);
}
#define TMPSZ 150

static int tcp4_seq_show(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st;
	struct sock *sk = v;

	seq_setwidth(seq, TMPSZ - 1);
	if (v == SEQ_START_TOKEN) {
		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
			   "rx_queue tr tm->when retrnsmt   uid  timeout "
			   "inode");
		goto out;
	}
	st = seq->private;

	if (sk->sk_state == TCP_TIME_WAIT)
		get_timewait4_sock(v, seq, st->num);
	else if (sk->sk_state == TCP_NEW_SYN_RECV)
		get_openreq4(v, seq, st->num);
	else
		get_tcp4_sock(v, seq, st->num);
out:
	seq_pad(seq, '\n');
	return 0;
}
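
/* Illustrative sketch (not in the original source): one made-up line of
 * the resulting /proc/net/tcp output, as emitted by get_tcp4_sock():
 *
 *   0: 0100007F:0016 00000000:0000 0A 00000000:00000000 00:00000000 00000000     0 0 12345 1 ...
 *
 * Here local_address 0100007F:0016 decodes to 127.0.0.1:22 (the __be32
 * address printed raw as %08X, the port in host order), and st 0A is
 * TCP_LISTEN (10).
 */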
static const struct file_operations tcp_afinfo_seq_fops = {
	.owner   = THIS_MODULE,
	.open    = tcp_seq_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release_net
};

static struct tcp_seq_afinfo tcp4_seq_afinfo = {
	.name		= "tcp",
	.family		= AF_INET,
	.seq_fops	= &tcp_afinfo_seq_fops,
	.seq_ops	= {
		.show		= tcp4_seq_show,
	},
};

static int __net_init tcp4_proc_init_net(struct net *net)
{
	return tcp_proc_register(net, &tcp4_seq_afinfo);
}

static void __net_exit tcp4_proc_exit_net(struct net *net)
{
	tcp_proc_unregister(net, &tcp4_seq_afinfo);
}

static struct pernet_operations tcp4_net_ops = {
	.init = tcp4_proc_init_net,
	.exit = tcp4_proc_exit_net,
};

int __init tcp4_proc_init(void)
{
	return register_pernet_subsys(&tcp4_net_ops);
}

void tcp4_proc_exit(void)
{
	unregister_pernet_subsys(&tcp4_net_ops);
}
#endif /* CONFIG_PROC_FS */
struct proto tcp_prot = {
	.name			= "TCP",
	.owner			= THIS_MODULE,
	.close			= tcp_close,
	.connect		= tcp_v4_connect,
	.disconnect		= tcp_disconnect,
	.accept			= inet_csk_accept,
	.ioctl			= tcp_ioctl,
	.init			= tcp_v4_init_sock,
	.destroy		= tcp_v4_destroy_sock,
	.shutdown		= tcp_shutdown,
	.setsockopt		= tcp_setsockopt,
	.getsockopt		= tcp_getsockopt,
	.recvmsg		= tcp_recvmsg,
	.sendmsg		= tcp_sendmsg,
	.sendpage		= tcp_sendpage,
	.backlog_rcv		= tcp_v4_do_rcv,
	.release_cb		= tcp_release_cb,
	.hash			= inet_hash,
	.unhash			= inet_unhash,
	.get_port		= inet_csk_get_port,
	.enter_memory_pressure	= tcp_enter_memory_pressure,
	.stream_memory_free	= tcp_stream_memory_free,
	.sockets_allocated	= &tcp_sockets_allocated,
	.orphan_count		= &tcp_orphan_count,
	.memory_allocated	= &tcp_memory_allocated,
	.memory_pressure	= &tcp_memory_pressure,
	.sysctl_mem		= sysctl_tcp_mem,
	.sysctl_wmem		= sysctl_tcp_wmem,
	.sysctl_rmem		= sysctl_tcp_rmem,
	.max_header		= MAX_TCP_HEADER,
	.obj_size		= sizeof(struct tcp_sock),
	.slab_flags		= SLAB_DESTROY_BY_RCU,
	.twsk_prot		= &tcp_timewait_sock_ops,
	.rsk_prot		= &tcp_request_sock_ops,
	.h.hashinfo		= &tcp_hashinfo,
	.no_autobind		= true,
#ifdef CONFIG_COMPAT
	.compat_setsockopt	= compat_tcp_setsockopt,
	.compat_getsockopt	= compat_tcp_getsockopt,
#endif
	.diag_destroy		= tcp_abort,
};
EXPORT_SYMBOL(tcp_prot);
static void __net_exit tcp_sk_exit(struct net *net)
{
	int cpu;

	for_each_possible_cpu(cpu)
		inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
	free_percpu(net->ipv4.tcp_sk);
}

static int __net_init tcp_sk_init(struct net *net)
{
	int res, cpu;

	net->ipv4.tcp_sk = alloc_percpu(struct sock *);
	if (!net->ipv4.tcp_sk)
		return -ENOMEM;

	for_each_possible_cpu(cpu) {
		struct sock *sk;

		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
					   IPPROTO_TCP, net);
		if (res)
			goto fail;
		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
		*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
	}

	net->ipv4.sysctl_tcp_ecn = 2;
	net->ipv4.sysctl_tcp_ecn_fallback = 1;

	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;

	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;

	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
	net->ipv4.sysctl_tcp_syncookies = 1;
	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
	net->ipv4.sysctl_tcp_orphan_retries = 0;
	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;

	return 0;
fail:
	tcp_sk_exit(net);

	return res;
}

static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
{
	inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
}

static struct pernet_operations __net_initdata tcp_sk_ops = {
	.init	    = tcp_sk_init,
	.exit	    = tcp_sk_exit,
	.exit_batch = tcp_sk_exit_batch,
};

void __init tcp_v4_init(void)
{
	inet_hashinfo_init(&tcp_hashinfo);
	if (register_pernet_subsys(&tcp_sk_ops))
		panic("Failed to create the TCP control socket.\n");
}