/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		The Internet Protocol (IP) output module.
 *
 * Version:	$Id: ip_output.c,v 1.100 2002/02/01 22:01:03 davem Exp $
 *
 * Authors:	Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Donald Becker, <becker@super.org>
 *		Alan Cox, <Alan.Cox@linux.org>
 *		Stefan Becker, <stefanb@yello.ping.de>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Hirokazu Takahashi, <taka@valinux.co.jp>
 *
 *	See ip_input.c for original log
 *
 *	Fixes:
 *		Alan Cox	:	Missing nonblock feature in ip_build_xmit.
 *		Mike Kilburn	:	htons() missing in ip_build_xmit.
 *		Bradford Johnson:	Fix faulty handling of some frames when
 *					no route is found.
 *		Alexander Demenshin:	Missing sk/skb free in ip_queue_xmit
 *					(in case the packet is not accepted by
 *					the output firewall rules)
 *		Mike McLagan	:	Routing by source
 *		Alexey Kuznetsov:	use new route cache
 *		Andi Kleen	:	Fix broken PMTU recovery and remove
 *					some redundant tests.
 *		Vitaly E. Lavrov:	Transparent proxy revived after year coma.
 *		Andi Kleen	:	Replace ip_reply with ip_send_reply.
 *		Andi Kleen	:	Split fast and slow ip_build_xmit path
 *					for decreased register pressure on x86
 *					and more readability.
 *		Marc Boucher	:	When call_out_firewall returns FW_QUEUE,
 *					silently drop skb instead of failing with -EPERM.
 *		Detlev Wengorz	:	Copy protocol for fragments.
 *		Hirokazu Takahashi:	HW checksumming for outgoing UDP
 *					datagrams.
 *		Hirokazu Takahashi:	sendfile() on UDP works now.
 */

#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/config.h>

#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/proc_fs.h>
#include <linux/stat.h>
#include <linux/init.h>

#include <net/snmp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/xfrm.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/arp.h>
#include <net/icmp.h>
#include <net/checksum.h>
#include <net/inetpeer.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/netfilter_bridge.h>
#include <linux/mroute.h>
#include <linux/netlink.h>
#include <linux/tcp.h>

int sysctl_ip_default_ttl = IPDEFTTL;

static int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*));

/* Generate a checksum for an outgoing IP datagram. */
__inline__ void ip_send_check(struct iphdr *iph)
{
        iph->check = 0;
        iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
}

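/* The check field must be zero while the sum is computed, hence the
 * clearing above.  Ones' complement arithmetic gives receivers their
 * verification test for free: summing the finished header, check field
 * included, yields 0xffff (i.e. folds to zero) iff the header is intact.
 */
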
/* dev_loopback_xmit for use with netfilter. */
static int ip_dev_loopback_xmit(struct sk_buff *newskb)
{
        newskb->mac.raw = newskb->data;
        __skb_pull(newskb, newskb->nh.raw - newskb->data);
        newskb->pkt_type = PACKET_LOOPBACK;
        newskb->ip_summed = CHECKSUM_UNNECESSARY;
        BUG_TRAP(newskb->dst);
        netif_rx(newskb);
        return 0;
}

static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
{
        int ttl = inet->uc_ttl;

        if (ttl < 0)
                ttl = dst_metric(dst, RTAX_HOPLIMIT);
        return ttl;
}

/*
 *		Add an ip header to a skbuff and send it out.
 */
int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
                          u32 saddr, u32 daddr, struct ip_options *opt)
{
        struct inet_sock *inet = inet_sk(sk);
        struct rtable *rt = (struct rtable *)skb->dst;
        struct iphdr *iph;

        /* Build the IP header. */
        if (opt)
                iph = (struct iphdr *)skb_push(skb, sizeof(struct iphdr) + opt->optlen);
        else
                iph = (struct iphdr *)skb_push(skb, sizeof(struct iphdr));

        iph->version  = 4;
        iph->ihl      = 5;
        iph->tos      = inet->tos;
        if (ip_dont_fragment(sk, &rt->u.dst))
                iph->frag_off = htons(IP_DF);
        else
                iph->frag_off = 0;
        iph->ttl      = ip_select_ttl(inet, &rt->u.dst);
        iph->daddr    = rt->rt_dst;
        iph->saddr    = rt->rt_src;
        iph->protocol = sk->sk_protocol;
        iph->tot_len  = htons(skb->len);
        ip_select_ident(iph, &rt->u.dst, sk);
        skb->nh.iph   = iph;

        if (opt && opt->optlen) {
                iph->ihl += opt->optlen>>2;
                ip_options_build(skb, opt, daddr, rt, 0);
        }
        ip_send_check(iph);

        skb->priority = sk->sk_priority;

        /* Send it out. */
        return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
                       dst_output);
}

EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);

static inline int ip_finish_output2(struct sk_buff *skb)
{
        struct dst_entry *dst = skb->dst;
        struct hh_cache *hh = dst->hh;
        struct net_device *dev = dst->dev;
        int hh_len = LL_RESERVED_SPACE(dev);

        /* Be paranoid, rather than too clever. */
        if (unlikely(skb_headroom(skb) < hh_len && dev->hard_header)) {
                struct sk_buff *skb2;

                skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
                if (skb2 == NULL) {
                        kfree_skb(skb);
                        return -ENOMEM;
                }
                if (skb->sk)
                        skb_set_owner_w(skb2, skb->sk);
                kfree_skb(skb);
                skb = skb2;
        }

        if (hh) {
                int hh_alen;

                read_lock_bh(&hh->hh_lock);
                hh_alen = HH_DATA_ALIGN(hh->hh_len);
                memcpy(skb->data - hh_alen, hh->hh_data, hh_alen);
                read_unlock_bh(&hh->hh_lock);
                skb_push(skb, hh->hh_len);
                return hh->hh_output(skb);
        } else if (dst->neighbour)
                return dst->neighbour->output(skb);

        if (net_ratelimit())
                printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
        kfree_skb(skb);
        return -EINVAL;
}

static inline int ip_finish_output(struct sk_buff *skb)
{
#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
        /* Policy lookup after SNAT yielded a new policy */
        if (skb->dst->xfrm != NULL)
                return xfrm4_output_finish(skb);
#endif
        if (skb->len > dst_mtu(skb->dst) &&
            !(skb_shinfo(skb)->ufo_size || skb_shinfo(skb)->tso_size))
                return ip_fragment(skb, ip_finish_output2);
        else
                return ip_finish_output2(skb);
}

int ip_mc_output(struct sk_buff *skb)
{
        struct sock *sk = skb->sk;
        struct rtable *rt = (struct rtable*)skb->dst;
        struct net_device *dev = rt->u.dst.dev;

        /*
         *	If the indicated interface is up and running, send the packet.
         */
        IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);

        skb->dev = dev;
        skb->protocol = htons(ETH_P_IP);

        /*
         *	Multicasts are looped back for other local users.
         */

        if (rt->rt_flags&RTCF_MULTICAST) {
                if ((!sk || inet_sk(sk)->mc_loop)
#ifdef CONFIG_IP_MROUTE
                /* Small optimization: do not loop back non-local frames
                   that returned after forwarding; they will be dropped
                   by ip_mr_input in any case.
                   Note that local frames are looped back to be delivered
                   to local recipients.

                   This check is duplicated in ip_mr_input at the moment.
                 */
                    && ((rt->rt_flags&RTCF_LOCAL) || !(IPCB(skb)->flags&IPSKB_FORWARDED))
#endif
                ) {
                        struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
                        if (newskb)
                                NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL,
                                        newskb->dev,
                                        ip_dev_loopback_xmit);
                }

                /* Multicasts with ttl 0 must not go beyond the host. */
                if (skb->nh.iph->ttl == 0) {
                        kfree_skb(skb);
                        return 0;
                }
        }

        if (rt->rt_flags&RTCF_BROADCAST) {
                struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
                if (newskb)
                        NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL,
                                newskb->dev, ip_dev_loopback_xmit);
        }

        return NF_HOOK(PF_INET, NF_IP_POST_ROUTING, skb, NULL, skb->dev,
                       ip_finish_output);
}

int ip_output(struct sk_buff *skb)
{
        struct net_device *dev = skb->dst->dev;

        IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);

        skb->dev = dev;
        skb->protocol = htons(ETH_P_IP);

        return NF_HOOK(PF_INET, NF_IP_POST_ROUTING, skb, NULL, dev,
                       ip_finish_output);
}

int ip_queue_xmit(struct sk_buff *skb, int ipfragok)
{
        struct sock *sk = skb->sk;
        struct inet_sock *inet = inet_sk(sk);
        struct ip_options *opt = inet->opt;
        struct rtable *rt;
        struct iphdr *iph;

        /* Skip all of this if the packet is already routed,
         * e.g. by something like SCTP.
         */
        rt = (struct rtable *) skb->dst;
        if (rt != NULL)
                goto packet_routed;

        /* Make sure we can route this packet. */
        rt = (struct rtable *)__sk_dst_check(sk, 0);
        if (rt == NULL) {
                u32 daddr;

                /* Use correct destination address if we have options. */
                daddr = inet->daddr;
                if (opt && opt->srr)
                        daddr = opt->faddr;

                {
                        struct flowi fl = { .oif = sk->sk_bound_dev_if,
                                            .nl_u = { .ip4_u =
                                                      { .daddr = daddr,
                                                        .saddr = inet->saddr,
                                                        .tos = RT_CONN_FLAGS(sk) } },
                                            .proto = sk->sk_protocol,
                                            .uli_u = { .ports =
                                                       { .sport = inet->sport,
                                                         .dport = inet->dport } } };

                        /* If this fails, the transport layer's retransmit
                         * mechanism will keep trying until the route appears
                         * or the connection times out.
                         */
                        if (ip_route_output_flow(&rt, &fl, sk, 0))
                                goto no_route;
                }
                sk_setup_caps(sk, &rt->u.dst);
        }
        skb->dst = dst_clone(&rt->u.dst);

packet_routed:
        if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
                goto no_route;

        /* OK, we know where to send it, allocate and build IP header. */
        iph = (struct iphdr *) skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
        *((__u16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
        iph->tot_len = htons(skb->len);
        if (ip_dont_fragment(sk, &rt->u.dst) && !ipfragok)
                iph->frag_off = htons(IP_DF);
        else
                iph->frag_off = 0;
        iph->ttl      = ip_select_ttl(inet, &rt->u.dst);
        iph->protocol = sk->sk_protocol;
        iph->saddr    = rt->rt_src;
        iph->daddr    = rt->rt_dst;
        skb->nh.iph   = iph;
        /* Transport layer set skb->h.foo itself. */

        if (opt && opt->optlen) {
                iph->ihl += opt->optlen >> 2;
                ip_options_build(skb, opt, inet->daddr, rt, 0);
        }

        ip_select_ident_more(iph, &rt->u.dst, sk,
                             (skb_shinfo(skb)->tso_segs ?: 1) - 1);

        /* Add an IP checksum. */
        ip_send_check(iph);

        skb->priority = sk->sk_priority;

        return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
                       dst_output);

no_route:
        IP_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
        kfree_skb(skb);
        return -EHOSTUNREACH;
}

static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
        to->pkt_type = from->pkt_type;
        to->priority = from->priority;
        to->protocol = from->protocol;
        dst_release(to->dst);
        to->dst = dst_clone(from->dst);
        to->dev = from->dev;

        /* Copy the flags to each fragment. */
        IPCB(to)->flags = IPCB(from)->flags;

#ifdef CONFIG_NET_SCHED
        to->tc_index = from->tc_index;
#endif
#ifdef CONFIG_NETFILTER
        to->nfmark = from->nfmark;
        /* Connection association is same as pre-frag packet */
        nf_conntrack_put(to->nfct);
        to->nfct = from->nfct;
        nf_conntrack_get(to->nfct);
        to->nfctinfo = from->nfctinfo;
#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
        to->ipvs_property = from->ipvs_property;
#endif
#ifdef CONFIG_BRIDGE_NETFILTER
        nf_bridge_put(to->nf_bridge);
        to->nf_bridge = from->nf_bridge;
        nf_bridge_get(to->nf_bridge);
#endif
#endif
}

/*
 *	This IP datagram is too large to be sent in one piece.  Break it up into
 *	smaller pieces (each of size equal to the IP header plus
 *	a block of the data of the original IP data part) that will yet fit in a
 *	single device frame, and queue such a frame for sending.
 */

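/* For instance, with a 1500 byte device MTU and a 20 byte header, each
 * fragment can carry up to 1480 data bytes, so a 4000 byte datagram
 * (3980 data bytes) splits into fragments of 1480, 1480 and 1020 data
 * bytes whose offsets are 0, 185 and 370: the frag_off field counts
 * 8-byte units.
 */
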
static int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
{
        struct iphdr *iph;
        int raw = 0;
        int ptr;
        struct net_device *dev;
        struct sk_buff *skb2;
        unsigned int mtu, hlen, left, len, ll_rs;
        int offset;
        __be16 not_last_frag;
        struct rtable *rt = (struct rtable*)skb->dst;
        int err = 0;

        dev = rt->u.dst.dev;

        /* Point into the IP datagram header. */
        iph = skb->nh.iph;

        if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
                icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
                          htonl(dst_mtu(&rt->u.dst)));
                kfree_skb(skb);
                return -EMSGSIZE;
        }

        /* Setup starting values. */
        hlen = iph->ihl * 4;
        mtu = dst_mtu(&rt->u.dst) - hlen;	/* Size of data space */
        IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;

        /* When frag_list is given, use it.  First, check its validity:
         * some transformers could create a wrong frag_list or break an
         * existing one; that is not prohibited.  In this case fall back
         * to copying.
         *
         * LATER: this step can be merged to real generation of fragments,
         * we can switch to copy when we see the first bad fragment.
         */
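        /* For instance, a frag_list whose first member holds 1000 data
         * bytes (not a multiple of 8) cannot be sent as-is: every fragment
         * except the last must carry a multiple of 8 data bytes, because
         * the IP offset field is expressed in 8-byte units.
         */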
        if (skb_shinfo(skb)->frag_list) {
                struct sk_buff *frag;
                int first_len = skb_pagelen(skb);

                if (first_len - hlen > mtu ||
                    ((first_len - hlen) & 7) ||
                    (iph->frag_off & htons(IP_MF|IP_OFFSET)) ||
                    skb_cloned(skb))
                        goto slow_path;

                for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
                        /* Correct geometry. */
                        if (frag->len > mtu ||
                            ((frag->len & 7) && frag->next) ||
                            skb_headroom(frag) < hlen)
                                goto slow_path;

                        /* Partially cloned skb? */
                        if (skb_shared(frag))
                                goto slow_path;

                        BUG_ON(frag->sk);
                        if (skb->sk) {
                                sock_hold(skb->sk);
                                frag->sk = skb->sk;
                                frag->destructor = sock_wfree;
                                skb->truesize -= frag->truesize;
                        }
                }

                /* Everything is OK. Generate! */
                err = 0;
                offset = 0;
                frag = skb_shinfo(skb)->frag_list;
                skb_shinfo(skb)->frag_list = NULL;
                skb->data_len = first_len - skb_headlen(skb);
                skb->len = first_len;
                iph->tot_len = htons(first_len);
                iph->frag_off = htons(IP_MF);
                ip_send_check(iph);

                for (;;) {
                        /* Prepare header of the next frame,
                         * before previous one went down. */
                        if (frag) {
                                frag->ip_summed = CHECKSUM_NONE;
                                frag->h.raw = frag->data;
                                frag->nh.raw = __skb_push(frag, hlen);
                                memcpy(frag->nh.raw, iph, hlen);
                                iph = frag->nh.iph;
                                iph->tot_len = htons(frag->len);
                                ip_copy_metadata(frag, skb);
                                if (offset == 0)
                                        ip_options_fragment(frag);
                                offset += skb->len - hlen;
                                iph->frag_off = htons(offset>>3);
                                if (frag->next != NULL)
                                        iph->frag_off |= htons(IP_MF);
                                /* Ready, complete checksum */
                                ip_send_check(iph);
                        }

                        err = output(skb);

                        if (err || !frag)
                                break;

                        skb = frag;
                        frag = skb->next;
                        skb->next = NULL;
                }

                if (err == 0) {
                        IP_INC_STATS(IPSTATS_MIB_FRAGOKS);
                        return 0;
                }

                while (frag) {
                        skb = frag->next;
                        kfree_skb(frag);
                        frag = skb;
                }
                IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
                return err;
        }

slow_path:
        left = skb->len - hlen;		/* Space per frame */
        ptr = raw + hlen;		/* Where to start from */

#ifdef CONFIG_BRIDGE_NETFILTER
        /* for bridged IP traffic encapsulated inside f.e. a vlan header,
         * we need to make room for the encapsulating header */
        ll_rs = LL_RESERVED_SPACE_EXTRA(rt->u.dst.dev, nf_bridge_pad(skb));
        mtu -= nf_bridge_pad(skb);
#else
        ll_rs = LL_RESERVED_SPACE(rt->u.dst.dev);
#endif

        /* Fragment the datagram. */
        offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
        not_last_frag = iph->frag_off & htons(IP_MF);

        /* Keep copying data until we run out. */
        while (left > 0) {
                len = left;
                /* IF: it doesn't fit, use 'mtu' - the data space left */
                if (len > mtu)
                        len = mtu;
                /* IF: we are not sending up to and including the packet end
                   then align the next start on an eight byte boundary */
                if (len < left)
                        len &= ~7;

                /* Allocate buffer. */
                if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
                        NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n");
                        err = -ENOMEM;
                        goto fail;
                }

                /* Set up data on packet */
                ip_copy_metadata(skb2, skb);
                skb_reserve(skb2, ll_rs);
                skb_put(skb2, len + hlen);
                skb2->nh.raw = skb2->data;
                skb2->h.raw = skb2->data + hlen;

                /* Charge the memory for the fragment to any owner
                 * it might possess. */
                if (skb->sk)
                        skb_set_owner_w(skb2, skb->sk);

                /* Copy the packet header into the new buffer. */
                memcpy(skb2->nh.raw, skb->data, hlen);

                /* Copy a block of the IP datagram. */
                if (skb_copy_bits(skb, ptr, skb2->h.raw, len))
                        BUG();
                left -= len;

                /* Fill in the new header fields. */
                iph = skb2->nh.iph;
                iph->frag_off = htons((offset >> 3));

                /* ANK: dirty, but effective trick. Upgrade options only if
                 * the segment to be fragmented was THE FIRST (otherwise,
                 * options are already fixed) and make it ONCE
                 * on the initial skb, so that all the following fragments
                 * will inherit fixed options.
                 */
                if (offset == 0)
                        ip_options_fragment(skb);
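                /* ip_options_fragment() replaces options that must not be
                 * copied into later fragments (e.g. timestamps and recorded
                 * routes) with IPOPT_NOOP bytes in the original skb, so the
                 * headers copied on subsequent iterations already carry a
                 * fixed option area. */
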
                /* Added AC: if we are fragmenting a fragment that's not the
                 * last fragment then keep MF set on each fragment. */
                if (left > 0 || not_last_frag)
                        iph->frag_off |= htons(IP_MF);
                ptr += len;
                offset += len;

                /* Put this fragment into the sending queue. */
                IP_INC_STATS(IPSTATS_MIB_FRAGCREATES);

                iph->tot_len = htons(len + hlen);

                ip_send_check(iph);

                err = output(skb2);
                if (err)
                        goto fail;
        }
        kfree_skb(skb);
        IP_INC_STATS(IPSTATS_MIB_FRAGOKS);
        return err;

fail:
        kfree_skb(skb);
        IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
        return err;
}

int
ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
{
        struct iovec *iov = from;

        if (skb->ip_summed == CHECKSUM_HW) {
                if (memcpy_fromiovecend(to, iov, offset, len) < 0)
                        return -EFAULT;
        } else {
                unsigned int csum = 0;
                if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
                        return -EFAULT;
                skb->csum = csum_block_add(skb->csum, csum, odd);
        }
        return 0;
}

static inline unsigned int
csum_page(struct page *page, int offset, int copy)
{
        char *kaddr;
        unsigned int csum;

        kaddr = kmap(page);
        csum = csum_partial(kaddr + offset, copy, 0);
        kunmap(page);
        return csum;
}

static inline int ip_ufo_append_data(struct sock *sk,
                        int getfrag(void *from, char *to, int offset, int len,
                                    int odd, struct sk_buff *skb),
                        void *from, int length, int hh_len, int fragheaderlen,
                        int transhdrlen, int mtu, unsigned int flags)
{
        struct sk_buff *skb;
        int err;

        /* There is support for UDP fragmentation offload by the network
         * device, so create one single skb packet containing the complete
         * udp datagram.
         */
        if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
                skb = sock_alloc_send_skb(sk,
                        hh_len + fragheaderlen + transhdrlen + 20,
                        (flags & MSG_DONTWAIT), &err);

                if (skb == NULL)
                        return err;

                /* reserve space for Hardware header */
                skb_reserve(skb, hh_len);

                /* create space for UDP/IP header */
                skb_put(skb, fragheaderlen + transhdrlen);

                /* initialize network header pointer */
                skb->nh.raw = skb->data;

                /* initialize protocol header pointer */
                skb->h.raw = skb->data + fragheaderlen;

                skb->ip_summed = CHECKSUM_HW;
                skb->csum = 0;
                sk->sk_sndmsg_off = 0;
        }

        err = skb_append_datato_frags(sk, skb, getfrag, from,
                                      (length - transhdrlen));
        if (!err) {
                /* specify the length of each IP datagram fragment */
                skb_shinfo(skb)->ufo_size = (mtu - fragheaderlen);
                __skb_queue_tail(&sk->sk_write_queue, skb);

                return 0;
        }
        /* There is not enough support to do UFO,
         * so follow the normal path.
         */
        kfree_skb(skb);
        return err;
}

/*
 *	ip_append_data() and ip_append_page() can make one large IP datagram
 *	from many pieces of data.  Each piece will be held on the socket
 *	until ip_push_pending_frames() is called.  Each piece can be a page
 *	or non-page data.
 *
 *	Not only UDP, other transport protocols - e.g. raw sockets - can use
 *	this interface potentially.
 *
 *	LATER: length must be adjusted by pad at tail, when it is required.
 */
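/*
 * Sketch of the calling pattern (illustrative only; locking and error
 * handling omitted).  This is roughly how udp_sendmsg() drives the
 * interface:
 *
 *	err = ip_append_data(sk, getfrag, from, length, transhdrlen,
 *			     &ipc, rt, flags);
 *	if (err)
 *		ip_flush_pending_frames(sk);
 *	else if (!(flags & MSG_MORE))
 *		err = ip_push_pending_frames(sk);
 */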
int ip_append_data(struct sock *sk,
                   int getfrag(void *from, char *to, int offset, int len,
                               int odd, struct sk_buff *skb),
                   void *from, int length, int transhdrlen,
                   struct ipcm_cookie *ipc, struct rtable *rt,
                   unsigned int flags)
{
        struct inet_sock *inet = inet_sk(sk);
        struct sk_buff *skb;

        struct ip_options *opt = NULL;
        int hh_len;
        int exthdrlen;
        int mtu;
        int copy;
        int err;
        int offset = 0;
        unsigned int maxfraglen, fragheaderlen;
        int csummode = CHECKSUM_NONE;

        if (flags&MSG_PROBE)
                return 0;

        if (skb_queue_empty(&sk->sk_write_queue)) {
                /*
                 * setup for corking.
                 */
                opt = ipc->opt;
                if (opt) {
                        if (inet->cork.opt == NULL) {
                                inet->cork.opt = kmalloc(sizeof(struct ip_options) + 40, sk->sk_allocation);
                                if (unlikely(inet->cork.opt == NULL))
                                        return -ENOBUFS;
                        }
                        memcpy(inet->cork.opt, opt, sizeof(struct ip_options)+opt->optlen);
                        inet->cork.flags |= IPCORK_OPT;
                        inet->cork.addr = ipc->addr;
                }
                dst_hold(&rt->u.dst);
                inet->cork.fragsize = mtu = dst_mtu(rt->u.dst.path);
                inet->cork.rt = rt;
                inet->cork.length = 0;
                sk->sk_sndmsg_page = NULL;
                sk->sk_sndmsg_off = 0;
                if ((exthdrlen = rt->u.dst.header_len) != 0) {
                        length += exthdrlen;
                        transhdrlen += exthdrlen;
                }
        } else {
                rt = inet->cork.rt;
                if (inet->cork.flags & IPCORK_OPT)
                        opt = inet->cork.opt;

                transhdrlen = 0;
                exthdrlen = 0;
                mtu = inet->cork.fragsize;
        }
        hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);

        fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
        maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;

        if (inet->cork.length + length > 0xFFFF - fragheaderlen) {
                ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu-exthdrlen);
                return -EMSGSIZE;
        }

        /*
         * transhdrlen > 0 means that this is the first fragment and we wish
         * it not to be fragmented in the future.
         */
        if (transhdrlen &&
            length + fragheaderlen <= mtu &&
            rt->u.dst.dev->features&(NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM) &&
            !exthdrlen)
                csummode = CHECKSUM_HW;

        inet->cork.length += length;
        if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) &&
            (rt->u.dst.dev->features & NETIF_F_UFO)) {
                if (ip_ufo_append_data(sk, getfrag, from, length, hh_len,
                                       fragheaderlen, transhdrlen, mtu, flags))
                        goto error;
                return 0;
        }

        /* So, what's going on in the loop below?
         *
         * We use the calculated fragment length to generate a chained skb;
         * each segment is an IP fragment ready for sending to the network
         * after adding the appropriate IP header.
         */
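        /* For instance, with mtu 1500 and no IP options, fragheaderlen is
         * 20, so maxfraglen = ((1500 - 20) & ~7) + 20 = 1500: every full
         * fragment is exactly mtu sized, and its 1480 data bytes keep the
         * offsets of all later fragments aligned to 8-byte units.
         */
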
        if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
                goto alloc_new_skb;

        while (length > 0) {
                /* Check if the remaining data fits into current packet. */
                copy = mtu - skb->len;
                if (copy < length)
                        copy = maxfraglen - skb->len;
                if (copy <= 0) {
                        char *data;
                        unsigned int datalen;
                        unsigned int fraglen;
                        unsigned int fraggap;
                        unsigned int alloclen;
                        struct sk_buff *skb_prev;
alloc_new_skb:
                        skb_prev = skb;
                        if (skb_prev)
                                fraggap = skb_prev->len - maxfraglen;
                        else
                                fraggap = 0;

                        /*
                         * If remaining data exceeds the mtu,
                         * we know we need more fragment(s).
                         */
                        datalen = length + fraggap;
                        if (datalen > mtu - fragheaderlen)
                                datalen = maxfraglen - fragheaderlen;
                        fraglen = datalen + fragheaderlen;

                        if ((flags & MSG_MORE) &&
                            !(rt->u.dst.dev->features&NETIF_F_SG))
                                alloclen = mtu;
                        else
                                alloclen = datalen + fragheaderlen;

                        /* The last fragment gets additional space at tail.
                         * Note, with MSG_MORE we overallocate on fragments,
                         * because we have no idea what fragment will be
                         * the last one.
                         */
                        if (datalen == length)
                                alloclen += rt->u.dst.trailer_len;

                        if (transhdrlen) {
                                skb = sock_alloc_send_skb(sk,
                                                alloclen + hh_len + 15,
                                                (flags & MSG_DONTWAIT), &err);
                        } else {
                                skb = NULL;
                                if (atomic_read(&sk->sk_wmem_alloc) <=
                                    2 * sk->sk_sndbuf)
                                        skb = sock_wmalloc(sk,
                                                           alloclen + hh_len + 15, 1,
                                                           sk->sk_allocation);
                                if (unlikely(skb == NULL))
                                        err = -ENOBUFS;
                        }
                        if (skb == NULL)
                                goto error;

                        /* Fill in the control structures */
                        skb->ip_summed = csummode;
                        skb->csum = 0;
                        skb_reserve(skb, hh_len);

                        /* Find where to start putting bytes. */
                        data = skb_put(skb, fraglen);
                        skb->nh.raw = data + exthdrlen;
                        data += fragheaderlen;
                        skb->h.raw = data + exthdrlen;

                        if (fraggap) {
                                skb->csum = skb_copy_and_csum_bits(
                                        skb_prev, maxfraglen,
                                        data + transhdrlen, fraggap, 0);
                                skb_prev->csum = csum_sub(skb_prev->csum,
                                                          skb->csum);
                                data += fraggap;
                                skb_trim(skb_prev, maxfraglen);
                        }

                        copy = datalen - transhdrlen - fraggap;
                        if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
                                err = -EFAULT;
                                kfree_skb(skb);
                                goto error;
                        }

                        offset += copy;
                        length -= datalen - fraggap;
                        transhdrlen = 0;
                        exthdrlen = 0;
                        csummode = CHECKSUM_NONE;

                        /* Put the packet on the pending queue. */
                        __skb_queue_tail(&sk->sk_write_queue, skb);
                        continue;
                }

                if (copy > length)
                        copy = length;

                if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
                        unsigned int off;

                        off = skb->len;
                        if (getfrag(from, skb_put(skb, copy),
                                    offset, copy, off, skb) < 0) {
                                __skb_trim(skb, off);
                                err = -EFAULT;
                                goto error;
                        }
                } else {
                        int i = skb_shinfo(skb)->nr_frags;
                        skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
                        struct page *page = sk->sk_sndmsg_page;
                        int off = sk->sk_sndmsg_off;
                        unsigned int left;

                        if (page && (left = PAGE_SIZE - off) > 0) {
                                if (copy >= left)
                                        copy = left;
                                if (page != frag->page) {
                                        if (i == MAX_SKB_FRAGS) {
                                                err = -EMSGSIZE;
                                                goto error;
                                        }
                                        get_page(page);
                                        skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
                                        frag = &skb_shinfo(skb)->frags[i];
                                }
                        } else if (i < MAX_SKB_FRAGS) {
                                if (copy > PAGE_SIZE)
                                        copy = PAGE_SIZE;
                                page = alloc_pages(sk->sk_allocation, 0);
                                if (page == NULL) {
                                        err = -ENOMEM;
                                        goto error;
                                }
                                sk->sk_sndmsg_page = page;
                                sk->sk_sndmsg_off = 0;

                                skb_fill_page_desc(skb, i, page, 0, 0);
                                frag = &skb_shinfo(skb)->frags[i];
                                skb->truesize += PAGE_SIZE;
                                atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc);
                        } else {
                                err = -EMSGSIZE;
                                goto error;
                        }
                        if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
                                err = -EFAULT;
                                goto error;
                        }
                        sk->sk_sndmsg_off += copy;
                        frag->size += copy;
                        skb->len += copy;
                        skb->data_len += copy;
                }
                offset += copy;
                length -= copy;
        }

        return 0;

error:
        inet->cork.length -= length;
        IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
        return err;
}

ssize_t ip_append_page(struct sock *sk, struct page *page,
                       int offset, size_t size, int flags)
{
        struct inet_sock *inet = inet_sk(sk);
        struct sk_buff *skb;
        struct rtable *rt;
        struct ip_options *opt = NULL;
        int hh_len;
        int mtu;
        int len;
        int err;
        unsigned int maxfraglen, fragheaderlen, fraggap;

        if (inet->hdrincl)
                return -EPERM;

        if (flags&MSG_PROBE)
                return 0;

        if (skb_queue_empty(&sk->sk_write_queue))
                return -EINVAL;

        rt = inet->cork.rt;
        if (inet->cork.flags & IPCORK_OPT)
                opt = inet->cork.opt;

        if (!(rt->u.dst.dev->features&NETIF_F_SG))
                return -EOPNOTSUPP;

        hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
        mtu = inet->cork.fragsize;

        fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
        maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;

        if (inet->cork.length + size > 0xFFFF - fragheaderlen) {
                ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu);
                return -EMSGSIZE;
        }

        if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
                return -EINVAL;

        inet->cork.length += size;
        if ((sk->sk_protocol == IPPROTO_UDP) &&
            (rt->u.dst.dev->features & NETIF_F_UFO))
                skb_shinfo(skb)->ufo_size = (mtu - fragheaderlen);

        while (size > 0) {
                int i;

                if (skb_shinfo(skb)->ufo_size)
                        len = size;
                else {
                        /* Check if the remaining data fits into current packet. */
                        len = mtu - skb->len;
                        if (len < size)
                                len = maxfraglen - skb->len;
                }
                if (len <= 0) {
                        struct sk_buff *skb_prev;
                        char *data;
                        struct iphdr *iph;
                        int alloclen;

                        skb_prev = skb;
                        fraggap = skb_prev->len - maxfraglen;

                        alloclen = fragheaderlen + hh_len + fraggap + 15;
                        skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
                        if (unlikely(!skb)) {
                                err = -ENOBUFS;
                                goto error;
                        }

                        /* Fill in the control structures */
                        skb->ip_summed = CHECKSUM_NONE;
                        skb->csum = 0;
                        skb_reserve(skb, hh_len);

                        /* Find where to start putting bytes. */
                        data = skb_put(skb, fragheaderlen + fraggap);
                        skb->nh.iph = iph = (struct iphdr *)data;
                        data += fragheaderlen;
                        skb->h.raw = data;

                        if (fraggap) {
                                skb->csum = skb_copy_and_csum_bits(
                                        skb_prev, maxfraglen,
                                        data, fraggap, 0);
                                skb_prev->csum = csum_sub(skb_prev->csum,
                                                          skb->csum);
                                skb_trim(skb_prev, maxfraglen);
                        }

                        /* Put the packet on the pending queue. */
                        __skb_queue_tail(&sk->sk_write_queue, skb);
                        continue;
                }

                i = skb_shinfo(skb)->nr_frags;
                if (len > size)
                        len = size;
                if (skb_can_coalesce(skb, i, page, offset)) {
                        skb_shinfo(skb)->frags[i-1].size += len;
                } else if (i < MAX_SKB_FRAGS) {
                        get_page(page);
                        skb_fill_page_desc(skb, i, page, offset, len);
                } else {
                        err = -EMSGSIZE;
                        goto error;
                }

                if (skb->ip_summed == CHECKSUM_NONE) {
                        unsigned int csum;
                        csum = csum_page(page, offset, len);
                        skb->csum = csum_block_add(skb->csum, csum, skb->len);
                }

                skb->len += len;
                skb->data_len += len;
                offset += len;
                size -= len;
        }
        return 0;

error:
        inet->cork.length -= size;
        IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
        return err;
}

/*
 *	Combine all pending IP fragments on the socket as one IP datagram
 *	and push them out.
 */
int ip_push_pending_frames(struct sock *sk)
{
        struct sk_buff *skb, *tmp_skb;
        struct sk_buff **tail_skb;
        struct inet_sock *inet = inet_sk(sk);
        struct ip_options *opt = NULL;
        struct rtable *rt = inet->cork.rt;
        struct iphdr *iph;
        __be16 df = 0;
        __u8 ttl;
        int err = 0;

        if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
                goto out;
        tail_skb = &(skb_shinfo(skb)->frag_list);

        /* move skb->data to ip header from ext header */
        if (skb->data < skb->nh.raw)
                __skb_pull(skb, skb->nh.raw - skb->data);
        while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
                __skb_pull(tmp_skb, skb->h.raw - skb->nh.raw);
                *tail_skb = tmp_skb;
                tail_skb = &(tmp_skb->next);
                skb->len += tmp_skb->len;
                skb->data_len += tmp_skb->len;
                skb->truesize += tmp_skb->truesize;
                __sock_put(tmp_skb->sk);
                tmp_skb->destructor = NULL;
                tmp_skb->sk = NULL;
        }

        /* Unless user demanded real pmtu discovery (IP_PMTUDISC_DO), we allow
         * to fragment the frame generated here.  No matter how transforms
         * change the size of the packet, it will come out.
         */
        if (inet->pmtudisc != IP_PMTUDISC_DO)
                skb->local_df = 1;

        /* DF bit is set when we want to see DF on outgoing frames.
         * If local_df is set too, we still allow to fragment this frame
         * locally. */
        if (inet->pmtudisc == IP_PMTUDISC_DO ||
            (skb->len <= dst_mtu(&rt->u.dst) &&
             ip_dont_fragment(sk, &rt->u.dst)))
                df = htons(IP_DF);

        if (inet->cork.flags & IPCORK_OPT)
                opt = inet->cork.opt;

        if (rt->rt_type == RTN_MULTICAST)
                ttl = inet->mc_ttl;
        else
                ttl = ip_select_ttl(inet, &rt->u.dst);

        iph = (struct iphdr *)skb->data;
        iph->version = 4;
        iph->ihl = 5;
        if (opt) {
                iph->ihl += opt->optlen>>2;
                ip_options_build(skb, opt, inet->cork.addr, rt, 0);
        }
        iph->tos = inet->tos;
        iph->tot_len = htons(skb->len);
        iph->frag_off = df;
        if (!df) {
                __ip_select_ident(iph, &rt->u.dst, 0);
        } else {
                iph->id = htons(inet->id++);
        }
        iph->ttl = ttl;
        iph->protocol = sk->sk_protocol;
        iph->saddr = rt->rt_src;
        iph->daddr = rt->rt_dst;
        ip_send_check(iph);

        skb->priority = sk->sk_priority;
        skb->dst = dst_clone(&rt->u.dst);

        /* Netfilter gets the whole, not yet fragmented skb. */
        err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL,
                      skb->dst->dev, dst_output);
        if (err) {
                if (err > 0)
                        err = inet->recverr ? net_xmit_errno(err) : 0;
                if (err)
                        goto error;
        }

out:
        inet->cork.flags &= ~IPCORK_OPT;
        kfree(inet->cork.opt);
        inet->cork.opt = NULL;
        if (inet->cork.rt) {
                ip_rt_put(inet->cork.rt);
                inet->cork.rt = NULL;
        }
        return err;

error:
        IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
        goto out;
}

/*
 *	Throw away all pending data on the socket.
 */
void ip_flush_pending_frames(struct sock *sk)
{
        struct inet_sock *inet = inet_sk(sk);
        struct sk_buff *skb;

        while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL)
                kfree_skb(skb);

        inet->cork.flags &= ~IPCORK_OPT;
        kfree(inet->cork.opt);
        inet->cork.opt = NULL;
        if (inet->cork.rt) {
                ip_rt_put(inet->cork.rt);
                inet->cork.rt = NULL;
        }
}

/*
 *	Fetch data from kernel space and fill in checksum if needed.
 */
static int ip_reply_glue_bits(void *dptr, char *to, int offset,
                              int len, int odd, struct sk_buff *skb)
{
        unsigned int csum;

        csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
        skb->csum = csum_block_add(skb->csum, csum, odd);
        return 0;
}

/*
 *	Generic function to send a packet as reply to another packet.
 *	Used to send TCP resets so far. ICMP should use this function too.
 *
 *	Should run single threaded per socket because it uses the sock
 *	structure to pass arguments.
 *
 *	LATER: switch from ip_build_xmit to ip_append_*
 */
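/*
 * Typical caller (illustrative): tcp_v4_send_reset() points arg->iov at a
 * prebuilt TCP header and relies on this function to echo the incoming
 * packet's addresses, options and TOS back to the sender.
 */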
void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg,
                   unsigned int len)
{
        struct inet_sock *inet = inet_sk(sk);
        struct {
                struct ip_options opt;
                char data[40];
        } replyopts;
        struct ipcm_cookie ipc;
        u32 daddr;
        struct rtable *rt = (struct rtable*)skb->dst;

        if (ip_options_echo(&replyopts.opt, skb))
                return;

        daddr = ipc.addr = rt->rt_src;
        ipc.opt = NULL;

        if (replyopts.opt.optlen) {
                ipc.opt = &replyopts.opt;

                if (ipc.opt->srr)
                        daddr = replyopts.opt.faddr;
        }

        {
                struct flowi fl = { .nl_u = { .ip4_u =
                                              { .daddr = daddr,
                                                .saddr = rt->rt_spec_dst,
                                                .tos = RT_TOS(skb->nh.iph->tos) } },
                                    /* Not quite clean, but right. */
                                    .uli_u = { .ports =
                                               { .sport = skb->h.th->dest,
                                                 .dport = skb->h.th->source } },
                                    .proto = sk->sk_protocol };
                if (ip_route_output_key(&rt, &fl))
                        return;
        }

        /* And let IP do all the hard work.
         *
         * This chunk is not reentrant, hence the spinlock.  Note that it
         * relies on the fact that this function is called with BHs locally
         * disabled and that sk cannot already be spinlocked.
         */
        bh_lock_sock(sk);
        inet->tos = skb->nh.iph->tos;
        sk->sk_priority = skb->priority;
        sk->sk_protocol = skb->nh.iph->protocol;
        ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
                       &ipc, rt, MSG_DONTWAIT);
        if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
                if (arg->csumoffset >= 0)
                        *((u16 *)skb->h.raw + arg->csumoffset) = csum_fold(csum_add(skb->csum, arg->csum));
                skb->ip_summed = CHECKSUM_NONE;
                ip_push_pending_frames(sk);
        }

        bh_unlock_sock(sk);

        ip_rt_put(rt);
}

void __init ip_init(void)
{
        ip_rt_init();
        inet_initpeers();

#if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
        igmp_mc_proc_init();
#endif
}

EXPORT_SYMBOL(ip_generic_getfrag);
EXPORT_SYMBOL(ip_queue_xmit);
EXPORT_SYMBOL(ip_send_check);