/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              The Internet Protocol (IP) output module.
 *
 * Authors:     Ross Biro
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Donald Becker, <becker@super.org>
 *              Alan Cox, <Alan.Cox@linux.org>
 *              Richard Underwood
 *              Stefan Becker, <stefanb@yello.ping.de>
 *              Jorge Cwik, <jorge@laser.satlink.net>
 *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *              Hirokazu Takahashi, <taka@valinux.co.jp>
 *
 *      See ip_input.c for original log
 *
 *      Fixes:
 *              Alan Cox        :       Missing nonblock feature in ip_build_xmit.
 *              Mike Kilburn    :       htons() missing in ip_build_xmit.
 *              Bradford Johnson:       Fix faulty handling of some frames when
 *                                      no route is found.
 *              Alexander Demenshin:    Missing sk/skb free in ip_queue_xmit
 *                                      (in case packet is not accepted by
 *                                      output firewall rules)
 *              Mike McLagan    :       Routing by source
 *              Alexey Kuznetsov:       use new route cache
 *              Andi Kleen:             Fix broken PMTU recovery and remove
 *                                      some redundant tests.
 *              Vitaly E. Lavrov :      Transparent proxy revived after year coma.
 *              Andi Kleen      :       Replace ip_reply with ip_send_reply.
 *              Andi Kleen      :       Split fast and slow ip_build_xmit path
 *                                      for decreased register pressure on x86
 *                                      and more readability.
 *              Marc Boucher    :       When call_out_firewall returns FW_QUEUE,
 *                                      silently drop skb instead of failing with -EPERM.
 *              Detlev Wengorz  :       Copy protocol for fragments.
 *              Hirokazu Takahashi:     HW checksumming for outgoing UDP
 *                                      datagrams.
 *              Hirokazu Takahashi:     sendfile() on UDP works now.
 */

#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/highmem.h>

#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/proc_fs.h>
#include <linux/stat.h>
#include <linux/init.h>

#include <net/snmp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/xfrm.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/arp.h>
#include <net/icmp.h>
#include <net/checksum.h>
#include <net/inetpeer.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/netfilter_bridge.h>
#include <linux/mroute.h>
#include <linux/netlink.h>
#include <linux/tcp.h>

int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;

/* Generate a checksum for an outgoing IP datagram. */
__inline__ void ip_send_check(struct iphdr *iph)
{
        iph->check = 0;
        iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
}

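/*
 * Illustrative sketch (editorial, not part of the original file): any
 * path that edits an already built header must recompute the check
 * field, which is exactly what ip_send_check() wraps:
 *
 *      iph->ttl = new_ttl;             hypothetical header edit
 *      ip_send_check(iph);             check rebuilt over ihl words
 *
 * ip_fast_csum() folds a 16-bit one's-complement sum over iph->ihl
 * 32-bit words, so iph->ihl must already be correct when it is called.
 */
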
int __ip_local_out(struct sk_buff *skb)
{
        struct iphdr *iph = ip_hdr(skb);

        iph->tot_len = htons(skb->len);
        ip_send_check(iph);
        return nf_hook(PF_INET, NF_INET_LOCAL_OUT, skb, NULL, skb_dst(skb)->dev,
                       dst_output);
}

int ip_local_out(struct sk_buff *skb)
{
        int err;

        err = __ip_local_out(skb);
        if (likely(err == 1))
                err = dst_output(skb);

        return err;
}
EXPORT_SYMBOL_GPL(ip_local_out);

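/*
 * Editorial note: nf_hook() returns 1 when every NF_INET_LOCAL_OUT
 * hook accepted the packet without stealing or queueing it, so
 * ip_local_out() treats 1 as "continue" and hands the skb to
 * dst_output(); any other value means netfilter consumed the packet
 * or reported an error, and is returned as-is.
 */
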
/* dev_loopback_xmit for use with netfilter. */
static int ip_dev_loopback_xmit(struct sk_buff *newskb)
{
        skb_reset_mac_header(newskb);
        __skb_pull(newskb, skb_network_offset(newskb));
        newskb->pkt_type = PACKET_LOOPBACK;
        newskb->ip_summed = CHECKSUM_UNNECESSARY;
        WARN_ON(!skb_dst(newskb));
        netif_rx(newskb);
        return 0;
}

static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
{
        int ttl = inet->uc_ttl;

        if (ttl < 0)
                ttl = dst_metric(dst, RTAX_HOPLIMIT);
        return ttl;
}

/*
 *      Add an ip header to a skbuff and send it out.
 */
int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
                          __be32 saddr, __be32 daddr, struct ip_options *opt)
{
        struct inet_sock *inet = inet_sk(sk);
        struct rtable *rt = skb_rtable(skb);
        struct iphdr *iph;

        /* Build the IP header. */
        skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
        skb_reset_network_header(skb);
        iph = ip_hdr(skb);
        iph->version  = 4;
        iph->ihl      = 5;
        iph->tos      = inet->tos;
        if (ip_dont_fragment(sk, &rt->u.dst))
                iph->frag_off = htons(IP_DF);
        else
                iph->frag_off = 0;
        iph->ttl      = ip_select_ttl(inet, &rt->u.dst);
        iph->daddr    = rt->rt_dst;
        iph->saddr    = rt->rt_src;
        iph->protocol = sk->sk_protocol;
        ip_select_ident(iph, &rt->u.dst, sk);

        if (opt && opt->optlen) {
                iph->ihl += opt->optlen>>2;
                ip_options_build(skb, opt, daddr, rt, 0);
        }

        skb->priority = sk->sk_priority;
        skb->mark = sk->sk_mark;

        /* Send it out. */
        return ip_local_out(skb);
}
EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);

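/*
 * Usage sketch (editorial): callers such as the TCP SYN-ACK path hold
 * an already routed skb, so the header build above is all that stands
 * between them and the LOCAL_OUT hook:
 *
 *      skb_dst_set(skb, dst);          route chosen by the caller
 *      err = ip_build_and_send_pkt(skb, sk, saddr, daddr, opt);
 *
 * Hypothetical call site; note that daddr is only consulted when
 * options are built, the header addresses come from rt_src/rt_dst.
 */
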
static inline int ip_finish_output2(struct sk_buff *skb)
{
        struct dst_entry *dst = skb_dst(skb);
        struct rtable *rt = (struct rtable *)dst;
        struct net_device *dev = dst->dev;
        unsigned int hh_len = LL_RESERVED_SPACE(dev);

        if (rt->rt_type == RTN_MULTICAST) {
                IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len);
        } else if (rt->rt_type == RTN_BROADCAST)
                IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTBCAST, skb->len);

        /* Be paranoid, rather than too clever. */
        if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
                struct sk_buff *skb2;

                skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
                if (skb2 == NULL) {
                        kfree_skb(skb);
                        return -ENOMEM;
                }
                if (skb->sk)
                        skb_set_owner_w(skb2, skb->sk);
                kfree_skb(skb);
                skb = skb2;
        }

        if (dst->hh)
                return neigh_hh_output(dst->hh, skb);
        else if (dst->neighbour)
                return dst->neighbour->output(skb);

        if (net_ratelimit())
                printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
        kfree_skb(skb);
        return -EINVAL;
}

static inline int ip_skb_dst_mtu(struct sk_buff *skb)
{
        struct inet_sock *inet = skb->sk ? inet_sk(skb->sk) : NULL;

        return (inet && inet->pmtudisc == IP_PMTUDISC_PROBE) ?
               skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
}

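/*
 * Editorial note: with IP_PMTUDISC_PROBE the socket deliberately
 * ignores the cached path MTU and sizes packets against the device
 * MTU.  A userspace prober opts in with:
 *
 *      int val = IP_PMTUDISC_PROBE;
 *      setsockopt(fd, IPPROTO_IP, IP_MTU_DISCOVER, &val, sizeof(val));
 *
 * after which ip_skb_dst_mtu() returns skb_dst(skb)->dev->mtu here.
 */
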
static int ip_finish_output(struct sk_buff *skb)
{
#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
        /* Policy lookup after SNAT yielded a new policy */
        if (skb_dst(skb)->xfrm != NULL) {
                IPCB(skb)->flags |= IPSKB_REROUTED;
                return dst_output(skb);
        }
#endif
        if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb))
                return ip_fragment(skb, ip_finish_output2);
        else
                return ip_finish_output2(skb);
}

int ip_mc_output(struct sk_buff *skb)
{
        struct sock *sk = skb->sk;
        struct rtable *rt = skb_rtable(skb);
        struct net_device *dev = rt->u.dst.dev;

        /*
         *      If the indicated interface is up and running, send the packet.
         */
        IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);

        skb->dev = dev;
        skb->protocol = htons(ETH_P_IP);

        /*
         *      Multicasts are looped back for other local users
         */

        if (rt->rt_flags&RTCF_MULTICAST) {
                if ((!sk || inet_sk(sk)->mc_loop)
#ifdef CONFIG_IP_MROUTE
                /* Small optimization: do not loop back non-local frames
                   that came back after forwarding; they will be dropped
                   by ip_mr_input in any case.
                   Note that local frames are looped back to be delivered
                   to local recipients.

                   This check is duplicated in ip_mr_input at the moment.
                 */
                    &&
                    ((rt->rt_flags & RTCF_LOCAL) ||
                     !(IPCB(skb)->flags & IPSKB_FORWARDED))
#endif
                   ) {
                        struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
                        if (newskb)
                                NF_HOOK(PF_INET, NF_INET_POST_ROUTING, newskb,
                                        NULL, newskb->dev,
                                        ip_dev_loopback_xmit);
                }

                /* Multicasts with ttl 0 must not go beyond the host */

                if (ip_hdr(skb)->ttl == 0) {
                        kfree_skb(skb);
                        return 0;
                }
        }

        if (rt->rt_flags&RTCF_BROADCAST) {
                struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
                if (newskb)
                        NF_HOOK(PF_INET, NF_INET_POST_ROUTING, newskb, NULL,
                                newskb->dev, ip_dev_loopback_xmit);
        }

        return NF_HOOK_COND(PF_INET, NF_INET_POST_ROUTING, skb, NULL, skb->dev,
                            ip_finish_output,
                            !(IPCB(skb)->flags & IPSKB_REROUTED));
}

int ip_output(struct sk_buff *skb)
{
        struct net_device *dev = skb_dst(skb)->dev;

        IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);

        skb->dev = dev;
        skb->protocol = htons(ETH_P_IP);

        return NF_HOOK_COND(PF_INET, NF_INET_POST_ROUTING, skb, NULL, dev,
                            ip_finish_output,
                            !(IPCB(skb)->flags & IPSKB_REROUTED));
}

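/*
 * Editorial summary of the locally generated output path as wired up
 * in this file:
 *
 *      ip_queue_xmit() / ip_push_pending_frames()
 *        -> ip_local_out()       LOCAL_OUT hook, then dst_output()
 *        -> ip_output()          POST_ROUTING hook (skipped while
 *                                IPSKB_REROUTED is set)
 *        -> ip_finish_output()   ip_fragment() if len > MTU and !GSO
 *        -> ip_finish_output2()  hh cache or neighbour output
 */
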
int ip_queue_xmit(struct sk_buff *skb, int ipfragok)
{
        struct sock *sk = skb->sk;
        struct inet_sock *inet = inet_sk(sk);
        struct ip_options *opt = inet->opt;
        struct rtable *rt;
        struct iphdr *iph;

        /* Skip all of this if the packet is already routed,
         * f.e. by something like SCTP.
         */
        rt = skb_rtable(skb);
        if (rt != NULL)
                goto packet_routed;

        /* Make sure we can route this packet. */
        rt = (struct rtable *)__sk_dst_check(sk, 0);
        if (rt == NULL) {
                __be32 daddr;

                /* Use correct destination address if we have options. */
                daddr = inet->inet_daddr;
                if (opt && opt->srr)
                        daddr = opt->faddr;

                {
                        struct flowi fl = { .oif = sk->sk_bound_dev_if,
                                            .mark = sk->sk_mark,
                                            .nl_u = { .ip4_u =
                                                      { .daddr = daddr,
                                                        .saddr = inet->inet_saddr,
                                                        .tos = RT_CONN_FLAGS(sk) } },
                                            .proto = sk->sk_protocol,
                                            .flags = inet_sk_flowi_flags(sk),
                                            .uli_u = { .ports =
                                                       { .sport = inet->inet_sport,
                                                         .dport = inet->inet_dport } } };

                        /* If this fails, the retransmit mechanism of the
                         * transport layer will keep trying until the route
                         * appears or the connection times itself out.
                         */
                        security_sk_classify_flow(sk, &fl);
                        if (ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 0))
                                goto no_route;
                }
                sk_setup_caps(sk, &rt->u.dst);
        }
        skb_dst_set(skb, dst_clone(&rt->u.dst));

packet_routed:
        if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
                goto no_route;

        /* OK, we know where to send it, allocate and build IP header. */
        skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
        skb_reset_network_header(skb);
        iph = ip_hdr(skb);
        *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
        if (ip_dont_fragment(sk, &rt->u.dst) && !ipfragok)
                iph->frag_off = htons(IP_DF);
        else
                iph->frag_off = 0;
        iph->ttl      = ip_select_ttl(inet, &rt->u.dst);
        iph->protocol = sk->sk_protocol;
        iph->saddr    = rt->rt_src;
        iph->daddr    = rt->rt_dst;
        /* The transport layer sets skb->h.foo itself. */

        if (opt && opt->optlen) {
                iph->ihl += opt->optlen >> 2;
                ip_options_build(skb, opt, inet->inet_daddr, rt, 0);
        }

        ip_select_ident_more(iph, &rt->u.dst, sk,
                             (skb_shinfo(skb)->gso_segs ?: 1) - 1);

        skb->priority = sk->sk_priority;
        skb->mark = sk->sk_mark;

        return ip_local_out(skb);

no_route:
        IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
        kfree_skb(skb);
        return -EHOSTUNREACH;
}

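/*
 * Editorial note on the packed store in ip_queue_xmit() above:
 * version (4 bits), ihl (4 bits) and tos (8 bits) are the first 16
 * bits of the IP header, so a single htons() write initialises all
 * three fields at once:
 *
 *      (4 << 12) | (5 << 8) | tos  ==  0x45 followed by the tos byte
 *
 * i.e. version 4 with a 20-byte header; iph->ihl is then bumped a few
 * lines later when options are appended.
 */
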
static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
        to->pkt_type = from->pkt_type;
        to->priority = from->priority;
        to->protocol = from->protocol;
        skb_dst_drop(to);
        skb_dst_set(to, dst_clone(skb_dst(from)));
        to->dev = from->dev;
        to->mark = from->mark;

        /* Copy the flags to each fragment. */
        IPCB(to)->flags = IPCB(from)->flags;

#ifdef CONFIG_NET_SCHED
        to->tc_index = from->tc_index;
#endif
        nf_copy(to, from);
#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
        to->nf_trace = from->nf_trace;
#endif
#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
        to->ipvs_property = from->ipvs_property;
#endif
        skb_copy_secmark(to, from);
}

/*
 *      This IP datagram is too large to be sent in one piece.  Break it up
 *      into smaller pieces (each of size equal to the IP header plus
 *      a block of the data of the original IP data part) that will yet fit
 *      in a single device frame, and queue such a frame for sending.
 */

int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
        struct iphdr *iph;
        int raw = 0;
        int ptr;
        struct net_device *dev;
        struct sk_buff *skb2;
        unsigned int mtu, hlen, left, len, ll_rs, pad;
        int offset;
        __be16 not_last_frag;
        struct rtable *rt = skb_rtable(skb);
        int err = 0;

        dev = rt->u.dst.dev;

        /*
         *      Point into the IP datagram header.
         */

        iph = ip_hdr(skb);

        if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
                IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
                icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
                          htonl(ip_skb_dst_mtu(skb)));
                kfree_skb(skb);
                return -EMSGSIZE;
        }

        /*
         *      Setup starting values.
         */

        hlen = iph->ihl * 4;
        mtu = dst_mtu(&rt->u.dst) - hlen;       /* Size of data space */
        IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;

        /* When frag_list is given, use it. First, check its validity:
         * some transformers could create a wrong frag_list or break an
         * existing one, which is not prohibited. In this case fall back
         * to copying.
         *
         * LATER: this step can be merged into the real generation of
         * fragments; we can switch to copying when we see the first bad
         * fragment.
         */
        if (skb_has_frags(skb)) {
                struct sk_buff *frag;
                int first_len = skb_pagelen(skb);
                int truesizes = 0;

                if (first_len - hlen > mtu ||
                    ((first_len - hlen) & 7) ||
                    (iph->frag_off & htons(IP_MF|IP_OFFSET)) ||
                    skb_cloned(skb))
                        goto slow_path;

                skb_walk_frags(skb, frag) {
                        /* Correct geometry. */
                        if (frag->len > mtu ||
                            ((frag->len & 7) && frag->next) ||
                            skb_headroom(frag) < hlen)
                                goto slow_path;

                        /* Partially cloned skb? */
                        if (skb_shared(frag))
                                goto slow_path;

                        BUG_ON(frag->sk);
                        if (skb->sk) {
                                frag->sk = skb->sk;
                                frag->destructor = sock_wfree;
                        }
                        truesizes += frag->truesize;
                }

                /* Everything is OK. Generate! */

                err = 0;
                offset = 0;
                frag = skb_shinfo(skb)->frag_list;
                skb_frag_list_init(skb);
                skb->data_len = first_len - skb_headlen(skb);
                skb->truesize -= truesizes;
                skb->len = first_len;
                iph->tot_len = htons(first_len);
                iph->frag_off = htons(IP_MF);
                ip_send_check(iph);

                for (;;) {
                        /* Prepare header of the next frame,
                         * before previous one went down. */
                        if (frag) {
                                frag->ip_summed = CHECKSUM_NONE;
                                skb_reset_transport_header(frag);
                                __skb_push(frag, hlen);
                                skb_reset_network_header(frag);
                                memcpy(skb_network_header(frag), iph, hlen);
                                iph = ip_hdr(frag);
                                iph->tot_len = htons(frag->len);
                                ip_copy_metadata(frag, skb);
                                if (offset == 0)
                                        ip_options_fragment(frag);
                                offset += skb->len - hlen;
                                iph->frag_off = htons(offset>>3);
                                if (frag->next != NULL)
                                        iph->frag_off |= htons(IP_MF);
                                /* Ready, complete checksum */
                                ip_send_check(iph);
                        }

                        err = output(skb);

                        if (!err)
                                IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
                        if (err || !frag)
                                break;

                        skb = frag;
                        frag = skb->next;
                        skb->next = NULL;
                }

                if (err == 0) {
                        IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
                        return 0;
                }

                while (frag) {
                        skb = frag->next;
                        kfree_skb(frag);
                        frag = skb;
                }
                IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
                return err;
        }

slow_path:
        left = skb->len - hlen;         /* Space per frame */
        ptr = raw + hlen;               /* Where to start from */

        /* for bridged IP traffic encapsulated inside f.e. a vlan header,
         * we need to make room for the encapsulating header
         */
        pad = nf_bridge_pad(skb);
        ll_rs = LL_RESERVED_SPACE_EXTRA(rt->u.dst.dev, pad);
        mtu -= pad;

        /*
         *      Fragment the datagram.
         */

        offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
        not_last_frag = iph->frag_off & htons(IP_MF);

        /*
         *      Keep copying data until we run out.
         */

        while (left > 0) {
                len = left;
                /* IF: it doesn't fit, use 'mtu' - the data space left */
                if (len > mtu)
                        len = mtu;
                /* IF: we are not sending up to and including the packet end
                   then align the next start on an eight byte boundary */
                if (len < left) {
                        len &= ~7;
                }
                /*
                 *      Allocate buffer.
                 */

                if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
                        NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n");
                        err = -ENOMEM;
                        goto fail;
                }

                /*
                 *      Set up data on packet
                 */

                ip_copy_metadata(skb2, skb);
                skb_reserve(skb2, ll_rs);
                skb_put(skb2, len + hlen);
                skb_reset_network_header(skb2);
                skb2->transport_header = skb2->network_header + hlen;

                /*
                 *      Charge the memory for the fragment to any owner
                 *      it might possess
                 */

                if (skb->sk)
                        skb_set_owner_w(skb2, skb->sk);

                /*
                 *      Copy the packet header into the new buffer.
                 */

                skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen);

                /*
                 *      Copy a block of the IP datagram.
                 */
                if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len))
                        BUG();
                left -= len;

                /*
                 *      Fill in the new header fields.
                 */
                iph = ip_hdr(skb2);
                iph->frag_off = htons((offset >> 3));

                /* ANK: dirty, but effective trick. Upgrade options only if
                 * the segment to be fragmented was THE FIRST (otherwise,
                 * options are already fixed) and make it ONCE
                 * on the initial skb, so that all the following fragments
                 * will inherit fixed options.
                 */
                if (offset == 0)
                        ip_options_fragment(skb);

                /*
                 *      Added AC : If we are fragmenting a fragment that's not the
                 *                 last fragment then keep the MF bit set on each one.
                 */
                if (left > 0 || not_last_frag)
                        iph->frag_off |= htons(IP_MF);
                ptr += len;
                offset += len;

                /*
                 *      Put this fragment into the sending queue.
                 */
                iph->tot_len = htons(len + hlen);

                ip_send_check(iph);

                err = output(skb2);
                if (err)
                        goto fail;

                IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
        }
        kfree_skb(skb);
        IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
        return err;

fail:
        kfree_skb(skb);
        IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
        return err;
}
EXPORT_SYMBOL(ip_fragment);

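/*
 * Worked example (editorial): with a 1500-byte device MTU and a
 * 20-byte header, the data space per fragment is 1480 bytes, so a
 * 4000-byte payload leaves the slow path as
 *
 *      frag 0: offset 0,    1480 bytes, MF set
 *      frag 1: offset 1480, 1480 bytes, MF set
 *      frag 2: offset 2960, 1040 bytes, MF clear
 *
 * Offsets go on the wire as htons(offset >> 3), which is why every
 * non-final fragment length is rounded down to a multiple of 8
 * (the len &= ~7 above).
 */
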
int
ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
{
        struct iovec *iov = from;

        if (skb->ip_summed == CHECKSUM_PARTIAL) {
                if (memcpy_fromiovecend(to, iov, offset, len) < 0)
                        return -EFAULT;
        } else {
                __wsum csum = 0;
                if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
                        return -EFAULT;
                skb->csum = csum_block_add(skb->csum, csum, odd);
        }
        return 0;
}

static inline __wsum
csum_page(struct page *page, int offset, int copy)
{
        char *kaddr;
        __wsum csum;
        kaddr = kmap(page);
        csum = csum_partial(kaddr + offset, copy, 0);
        kunmap(page);
        return csum;
}

static inline int ip_ufo_append_data(struct sock *sk,
                        int getfrag(void *from, char *to, int offset, int len,
                                    int odd, struct sk_buff *skb),
                        void *from, int length, int hh_len, int fragheaderlen,
                        int transhdrlen, int mtu, unsigned int flags)
{
        struct sk_buff *skb;
        int err;

        /* There is support for UDP fragmentation offload by network
         * device, so create one single skb packet containing the complete
         * udp datagram
         */
        if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
                skb = sock_alloc_send_skb(sk,
                        hh_len + fragheaderlen + transhdrlen + 20,
                        (flags & MSG_DONTWAIT), &err);

                if (skb == NULL)
                        return err;

                /* reserve space for Hardware header */
                skb_reserve(skb, hh_len);

                /* create space for UDP/IP header */
                skb_put(skb, fragheaderlen + transhdrlen);

                /* initialize network header pointer */
                skb_reset_network_header(skb);

                /* initialize protocol header pointer */
                skb->transport_header = skb->network_header + fragheaderlen;

                skb->ip_summed = CHECKSUM_PARTIAL;
                skb->csum = 0;
                sk->sk_sndmsg_off = 0;

                /* specify the length of each IP datagram fragment */
                skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
                skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
                __skb_queue_tail(&sk->sk_write_queue, skb);
        }

        return skb_append_datato_frags(sk, skb, getfrag, from,
                                       (length - transhdrlen));
}

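/*
 * Editorial note: with UDP fragmentation offload the stack queues one
 * oversized skb and records the per-fragment payload size in
 * gso_size; the device (or the software GSO fallback) slices it into
 * on-wire IP fragments later, so ip_fragment() never runs for such a
 * datagram.
 */
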
/*
 *      ip_append_data() and ip_append_page() can make one large IP datagram
 *      from many pieces of data. Each piece will be held on the socket
 *      until ip_push_pending_frames() is called. Each piece can be a page
 *      or non-page data.
 *
 *      Not only UDP, other transport protocols - e.g. raw sockets - can use
 *      this interface potentially.
 *
 *      LATER: length must be adjusted by pad at tail, when it is required.
 */
int ip_append_data(struct sock *sk,
                   int getfrag(void *from, char *to, int offset, int len,
                               int odd, struct sk_buff *skb),
                   void *from, int length, int transhdrlen,
                   struct ipcm_cookie *ipc, struct rtable **rtp,
                   unsigned int flags)
{
        struct inet_sock *inet = inet_sk(sk);
        struct sk_buff *skb;

        struct ip_options *opt = NULL;
        int hh_len;
        int exthdrlen;
        int mtu;
        int copy;
        int err;
        int offset = 0;
        unsigned int maxfraglen, fragheaderlen;
        int csummode = CHECKSUM_NONE;
        struct rtable *rt;

        if (flags&MSG_PROBE)
                return 0;

        if (skb_queue_empty(&sk->sk_write_queue)) {
                /*
                 * setup for corking.
                 */
                opt = ipc->opt;
                if (opt) {
                        if (inet->cork.opt == NULL) {
                                inet->cork.opt = kmalloc(sizeof(struct ip_options) + 40, sk->sk_allocation);
                                if (unlikely(inet->cork.opt == NULL))
                                        return -ENOBUFS;
                        }
                        memcpy(inet->cork.opt, opt, sizeof(struct ip_options)+opt->optlen);
                        inet->cork.flags |= IPCORK_OPT;
                        inet->cork.addr = ipc->addr;
                }
                rt = *rtp;
                if (unlikely(!rt))
                        return -EFAULT;
                /*
                 * We steal a reference to this route; the caller should not
                 * release it.
                 */
                *rtp = NULL;
                inet->cork.fragsize = mtu = inet->pmtudisc == IP_PMTUDISC_PROBE ?
                                            rt->u.dst.dev->mtu :
                                            dst_mtu(rt->u.dst.path);
                inet->cork.dst = &rt->u.dst;
                inet->cork.length = 0;
                sk->sk_sndmsg_page = NULL;
                sk->sk_sndmsg_off = 0;
                if ((exthdrlen = rt->u.dst.header_len) != 0) {
                        length += exthdrlen;
                        transhdrlen += exthdrlen;
                }
        } else {
                rt = (struct rtable *)inet->cork.dst;
                if (inet->cork.flags & IPCORK_OPT)
                        opt = inet->cork.opt;

                transhdrlen = 0;
                exthdrlen = 0;
                mtu = inet->cork.fragsize;
        }
        hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);

        fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
        maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;

        if (inet->cork.length + length > 0xFFFF - fragheaderlen) {
                ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport,
                               mtu-exthdrlen);
                return -EMSGSIZE;
        }

        /*
         * transhdrlen > 0 means that this is the first fragment and we wish
         * it won't be fragmented in the future.
         */
        if (transhdrlen &&
            length + fragheaderlen <= mtu &&
            rt->u.dst.dev->features & NETIF_F_V4_CSUM &&
            !exthdrlen)
                csummode = CHECKSUM_PARTIAL;

        inet->cork.length += length;
        if (((length > mtu) || !skb_queue_empty(&sk->sk_write_queue)) &&
            (sk->sk_protocol == IPPROTO_UDP) &&
            (rt->u.dst.dev->features & NETIF_F_UFO)) {
                err = ip_ufo_append_data(sk, getfrag, from, length, hh_len,
                                         fragheaderlen, transhdrlen, mtu,
                                         flags);
                if (err)
                        goto error;
                return 0;
        }

        /* So, what's going on in the loop below?
         *
         * We use the calculated fragment length to generate a chained skb,
         * each of whose segments is an IP fragment ready for sending to the
         * network after adding an appropriate IP header.
         */

        if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
                goto alloc_new_skb;

        while (length > 0) {
                /* Check if the remaining data fits into current packet. */
                copy = mtu - skb->len;
                if (copy < length)
                        copy = maxfraglen - skb->len;
                if (copy <= 0) {
                        char *data;
                        unsigned int datalen;
                        unsigned int fraglen;
                        unsigned int fraggap;
                        unsigned int alloclen;
                        struct sk_buff *skb_prev;
alloc_new_skb:
                        skb_prev = skb;
                        if (skb_prev)
                                fraggap = skb_prev->len - maxfraglen;
                        else
                                fraggap = 0;

                        /*
                         * If remaining data exceeds the mtu,
                         * we know we need more fragment(s).
                         */
                        datalen = length + fraggap;
                        if (datalen > mtu - fragheaderlen)
                                datalen = maxfraglen - fragheaderlen;
                        fraglen = datalen + fragheaderlen;

                        if ((flags & MSG_MORE) &&
                            !(rt->u.dst.dev->features&NETIF_F_SG))
                                alloclen = mtu;
                        else
                                alloclen = datalen + fragheaderlen;

                        /* The last fragment gets additional space at tail.
                         * Note, with MSG_MORE we overallocate on fragments,
                         * because we have no idea what fragment will be
                         * the last.
                         */
                        if (datalen == length + fraggap)
                                alloclen += rt->u.dst.trailer_len;

                        if (transhdrlen) {
                                skb = sock_alloc_send_skb(sk,
                                                alloclen + hh_len + 15,
                                                (flags & MSG_DONTWAIT), &err);
                        } else {
                                skb = NULL;
                                if (atomic_read(&sk->sk_wmem_alloc) <=
                                    2 * sk->sk_sndbuf)
                                        skb = sock_wmalloc(sk,
                                                           alloclen + hh_len + 15, 1,
                                                           sk->sk_allocation);
                                if (unlikely(skb == NULL))
                                        err = -ENOBUFS;
                                else
                                        /* only the initial fragment is
                                           time stamped */
                                        ipc->shtx.flags = 0;
                        }
                        if (skb == NULL)
                                goto error;

                        /*
                         *      Fill in the control structures
                         */
                        skb->ip_summed = csummode;
                        skb->csum = 0;
                        skb_reserve(skb, hh_len);
                        *skb_tx(skb) = ipc->shtx;

                        /*
                         *      Find where to start putting bytes.
                         */
                        data = skb_put(skb, fraglen);
                        skb_set_network_header(skb, exthdrlen);
                        skb->transport_header = (skb->network_header +
                                                 fragheaderlen);
                        data += fragheaderlen;

                        if (fraggap) {
                                skb->csum = skb_copy_and_csum_bits(
                                        skb_prev, maxfraglen,
                                        data + transhdrlen, fraggap, 0);
                                skb_prev->csum = csum_sub(skb_prev->csum,
                                                          skb->csum);
                                data += fraggap;
                                pskb_trim_unique(skb_prev, maxfraglen);
                        }

                        copy = datalen - transhdrlen - fraggap;
                        if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
                                err = -EFAULT;
                                kfree_skb(skb);
                                goto error;
                        }

                        offset += copy;
                        length -= datalen - fraggap;
                        transhdrlen = 0;
                        exthdrlen = 0;
                        csummode = CHECKSUM_NONE;

                        /*
                         * Put the packet on the pending queue.
                         */
                        __skb_queue_tail(&sk->sk_write_queue, skb);
                        continue;
                }

                if (copy > length)
                        copy = length;

                if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
                        unsigned int off;

                        off = skb->len;
                        if (getfrag(from, skb_put(skb, copy),
                                        offset, copy, off, skb) < 0) {
                                __skb_trim(skb, off);
                                err = -EFAULT;
                                goto error;
                        }
                } else {
                        int i = skb_shinfo(skb)->nr_frags;
                        skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
                        struct page *page = sk->sk_sndmsg_page;
                        int off = sk->sk_sndmsg_off;
                        unsigned int left;

                        if (page && (left = PAGE_SIZE - off) > 0) {
                                if (copy >= left)
                                        copy = left;
                                if (page != frag->page) {
                                        if (i == MAX_SKB_FRAGS) {
                                                err = -EMSGSIZE;
                                                goto error;
                                        }
                                        get_page(page);
                                        skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
                                        frag = &skb_shinfo(skb)->frags[i];
                                }
                        } else if (i < MAX_SKB_FRAGS) {
                                if (copy > PAGE_SIZE)
                                        copy = PAGE_SIZE;
                                page = alloc_pages(sk->sk_allocation, 0);
                                if (page == NULL)  {
                                        err = -ENOMEM;
                                        goto error;
                                }
                                sk->sk_sndmsg_page = page;
                                sk->sk_sndmsg_off = 0;

                                skb_fill_page_desc(skb, i, page, 0, 0);
                                frag = &skb_shinfo(skb)->frags[i];
                        } else {
                                err = -EMSGSIZE;
                                goto error;
                        }
                        if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
                                err = -EFAULT;
                                goto error;
                        }
                        sk->sk_sndmsg_off += copy;
                        frag->size += copy;
                        skb->len += copy;
                        skb->data_len += copy;
                        skb->truesize += copy;
                        atomic_add(copy, &sk->sk_wmem_alloc);
                }
                offset += copy;
                length -= copy;
        }

        return 0;

error:
        inet->cork.length -= length;
        IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
        return err;
}

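/*
 * Usage sketch (editorial): the corking this implements is reachable
 * from userspace as UDP_CORK (or per-call MSG_MORE), e.g.
 *
 *      int on = 1;
 *      setsockopt(fd, IPPROTO_UDP, UDP_CORK, &on, sizeof(on));
 *      send(fd, hdr, hdrlen, 0);       queued by ip_append_data()
 *      send(fd, body, bodylen, 0);     appended to the same datagram
 *      on = 0;
 *      setsockopt(fd, IPPROTO_UDP, UDP_CORK, &on, sizeof(on));
 *
 * Uncorking triggers ip_push_pending_frames(): one IP datagram,
 * already split at maxfraglen boundaries, leaves the socket.
 */
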
ssize_t ip_append_page(struct sock *sk, struct page *page,
                       int offset, size_t size, int flags)
{
        struct inet_sock *inet = inet_sk(sk);
        struct sk_buff *skb;
        struct rtable *rt;
        struct ip_options *opt = NULL;
        int hh_len;
        int mtu;
        int len;
        int err;
        unsigned int maxfraglen, fragheaderlen, fraggap;

        if (inet->hdrincl)
                return -EPERM;

        if (flags&MSG_PROBE)
                return 0;

        if (skb_queue_empty(&sk->sk_write_queue))
                return -EINVAL;

        rt = (struct rtable *)inet->cork.dst;
        if (inet->cork.flags & IPCORK_OPT)
                opt = inet->cork.opt;

        if (!(rt->u.dst.dev->features&NETIF_F_SG))
                return -EOPNOTSUPP;

        hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
        mtu = inet->cork.fragsize;

        fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
        maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;

        if (inet->cork.length + size > 0xFFFF - fragheaderlen) {
                ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport, mtu);
                return -EMSGSIZE;
        }

        if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
                return -EINVAL;

        inet->cork.length += size;
        if ((sk->sk_protocol == IPPROTO_UDP) &&
            (rt->u.dst.dev->features & NETIF_F_UFO)) {
                skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
                skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
        }


        while (size > 0) {
                int i;

                if (skb_is_gso(skb))
                        len = size;
                else {

                        /* Check if the remaining data fits into current packet. */
                        len = mtu - skb->len;
                        if (len < size)
                                len = maxfraglen - skb->len;
                }
                if (len <= 0) {
                        struct sk_buff *skb_prev;
                        int alloclen;

                        skb_prev = skb;
                        fraggap = skb_prev->len - maxfraglen;

                        alloclen = fragheaderlen + hh_len + fraggap + 15;
                        skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
                        if (unlikely(!skb)) {
                                err = -ENOBUFS;
                                goto error;
                        }

                        /*
                         *      Fill in the control structures
                         */
                        skb->ip_summed = CHECKSUM_NONE;
                        skb->csum = 0;
                        skb_reserve(skb, hh_len);

                        /*
                         *      Find where to start putting bytes.
                         */
                        skb_put(skb, fragheaderlen + fraggap);
                        skb_reset_network_header(skb);
                        skb->transport_header = (skb->network_header +
                                                 fragheaderlen);
                        if (fraggap) {
                                skb->csum = skb_copy_and_csum_bits(skb_prev,
                                                                   maxfraglen,
                                                    skb_transport_header(skb),
                                                                   fraggap, 0);
                                skb_prev->csum = csum_sub(skb_prev->csum,
                                                          skb->csum);
                                pskb_trim_unique(skb_prev, maxfraglen);
                        }

                        /*
                         * Put the packet on the pending queue.
                         */
                        __skb_queue_tail(&sk->sk_write_queue, skb);
                        continue;
                }

                i = skb_shinfo(skb)->nr_frags;
                if (len > size)
                        len = size;
                if (skb_can_coalesce(skb, i, page, offset)) {
                        skb_shinfo(skb)->frags[i-1].size += len;
                } else if (i < MAX_SKB_FRAGS) {
                        get_page(page);
                        skb_fill_page_desc(skb, i, page, offset, len);
                } else {
                        err = -EMSGSIZE;
                        goto error;
                }

                if (skb->ip_summed == CHECKSUM_NONE) {
                        __wsum csum;
                        csum = csum_page(page, offset, len);
                        skb->csum = csum_block_add(skb->csum, csum, skb->len);
                }

                skb->len += len;
                skb->data_len += len;
                skb->truesize += len;
                atomic_add(len, &sk->sk_wmem_alloc);
                offset += len;
                size -= len;
        }
        return 0;

error:
        inet->cork.length -= size;
        IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
        return err;
}

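/*
 * Editorial note: ip_append_page() is the zero-copy twin of
 * ip_append_data(); udp_sendpage() feeds it page references from
 * sendfile()/sendpage(), so payload bytes are only described by page
 * fragments, never copied, hence the hard NETIF_F_SG requirement.
 */
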
static void ip_cork_release(struct inet_sock *inet)
{
        inet->cork.flags &= ~IPCORK_OPT;
        kfree(inet->cork.opt);
        inet->cork.opt = NULL;
        dst_release(inet->cork.dst);
        inet->cork.dst = NULL;
}

/*
 *      Combine all pending IP fragments on the socket as one IP datagram
 *      and push them out.
 */
int ip_push_pending_frames(struct sock *sk)
{
        struct sk_buff *skb, *tmp_skb;
        struct sk_buff **tail_skb;
        struct inet_sock *inet = inet_sk(sk);
        struct net *net = sock_net(sk);
        struct ip_options *opt = NULL;
        struct rtable *rt = (struct rtable *)inet->cork.dst;
        struct iphdr *iph;
        __be16 df = 0;
        __u8 ttl;
        int err = 0;

        if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
                goto out;
        tail_skb = &(skb_shinfo(skb)->frag_list);

        /* move skb->data to ip header from ext header */
        if (skb->data < skb_network_header(skb))
                __skb_pull(skb, skb_network_offset(skb));
        while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
                __skb_pull(tmp_skb, skb_network_header_len(skb));
                *tail_skb = tmp_skb;
                tail_skb = &(tmp_skb->next);
                skb->len += tmp_skb->len;
                skb->data_len += tmp_skb->len;
                skb->truesize += tmp_skb->truesize;
                tmp_skb->destructor = NULL;
                tmp_skb->sk = NULL;
        }

        /* Unless user demanded real pmtu discovery (IP_PMTUDISC_DO), we allow
         * to fragment the frame generated here. No matter how transforms
         * change the size of the packet, it will come out.
         */
        if (inet->pmtudisc < IP_PMTUDISC_DO)
                skb->local_df = 1;

        /* DF bit is set when we want to see DF on outgoing frames.
         * If local_df is set too, we still allow to fragment this frame
         * locally. */
        if (inet->pmtudisc >= IP_PMTUDISC_DO ||
            (skb->len <= dst_mtu(&rt->u.dst) &&
             ip_dont_fragment(sk, &rt->u.dst)))
                df = htons(IP_DF);

        if (inet->cork.flags & IPCORK_OPT)
                opt = inet->cork.opt;

        if (rt->rt_type == RTN_MULTICAST)
                ttl = inet->mc_ttl;
        else
                ttl = ip_select_ttl(inet, &rt->u.dst);

        iph = (struct iphdr *)skb->data;
        iph->version = 4;
        iph->ihl = 5;
        if (opt) {
                iph->ihl += opt->optlen>>2;
                ip_options_build(skb, opt, inet->cork.addr, rt, 0);
        }
        iph->tos = inet->tos;
        iph->frag_off = df;
        ip_select_ident(iph, &rt->u.dst, sk);
        iph->ttl = ttl;
        iph->protocol = sk->sk_protocol;
        iph->saddr = rt->rt_src;
        iph->daddr = rt->rt_dst;

        skb->priority = sk->sk_priority;
        skb->mark = sk->sk_mark;
        /*
         * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
         * on dst refcount
         */
        inet->cork.dst = NULL;
        skb_dst_set(skb, &rt->u.dst);

        if (iph->protocol == IPPROTO_ICMP)
                icmp_out_count(net, ((struct icmphdr *)
                        skb_transport_header(skb))->type);

        /* Netfilter gets the whole, not yet fragmented, skb. */
        err = ip_local_out(skb);
        if (err) {
                if (err > 0)
                        err = net_xmit_errno(err);
                if (err)
                        goto error;
        }

out:
        ip_cork_release(inet);
        return err;

error:
        IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS);
        goto out;
}

/*
 *      Throw away all pending data on the socket.
 */
void ip_flush_pending_frames(struct sock *sk)
{
        struct sk_buff *skb;

        while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL)
                kfree_skb(skb);

        ip_cork_release(inet_sk(sk));
}

/*
 *      Fetch data from kernel space and fill in checksum if needed.
 */
static int ip_reply_glue_bits(void *dptr, char *to, int offset,
                              int len, int odd, struct sk_buff *skb)
{
        __wsum csum;

        csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
        skb->csum = csum_block_add(skb->csum, csum, odd);
        return 0;
}

/*
 *      Generic function to send a packet as reply to another packet.
 *      Used to send TCP resets so far. ICMP should use this function too.
 *
 *      Should run single threaded per socket because it uses the sock
 *      structure to pass arguments.
 */
void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg,
                   unsigned int len)
{
        struct inet_sock *inet = inet_sk(sk);
        struct {
                struct ip_options opt;
                char data[40];
        } replyopts;
        struct ipcm_cookie ipc;
        __be32 daddr;
        struct rtable *rt = skb_rtable(skb);

        if (ip_options_echo(&replyopts.opt, skb))
                return;

        daddr = ipc.addr = rt->rt_src;
        ipc.opt = NULL;
        ipc.shtx.flags = 0;

        if (replyopts.opt.optlen) {
                ipc.opt = &replyopts.opt;

                if (ipc.opt->srr)
                        daddr = replyopts.opt.faddr;
        }

        {
                struct flowi fl = { .oif = arg->bound_dev_if,
                                    .nl_u = { .ip4_u =
                                              { .daddr = daddr,
                                                .saddr = rt->rt_spec_dst,
                                                .tos = RT_TOS(ip_hdr(skb)->tos) } },
                                    /* Not quite clean, but right. */
                                    .uli_u = { .ports =
                                               { .sport = tcp_hdr(skb)->dest,
                                                 .dport = tcp_hdr(skb)->source } },
                                    .proto = sk->sk_protocol,
                                    .flags = ip_reply_arg_flowi_flags(arg) };
                security_skb_classify_flow(skb, &fl);
                if (ip_route_output_key(sock_net(sk), &rt, &fl))
                        return;
        }

        /* And let IP do all the hard work.

           This chunk is not reentrant, hence the spinlock.
           Note that it uses the fact that this function is called
           with locally disabled BH and that sk cannot be already spinlocked.
         */
        bh_lock_sock(sk);
        inet->tos = ip_hdr(skb)->tos;
        sk->sk_priority = skb->priority;
        sk->sk_protocol = ip_hdr(skb)->protocol;
        sk->sk_bound_dev_if = arg->bound_dev_if;
        ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
                       &ipc, &rt, MSG_DONTWAIT);
        if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
                if (arg->csumoffset >= 0)
                        *((__sum16 *)skb_transport_header(skb) +
                          arg->csumoffset) = csum_fold(csum_add(skb->csum,
                                                                arg->csum));
                skb->ip_summed = CHECKSUM_NONE;
                ip_push_pending_frames(sk);
        }

        bh_unlock_sock(sk);

        ip_rt_put(rt);
}

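/*
 * Editorial note: the flow constructed above deliberately mirrors the
 * packet being answered (.sport = tcp_hdr(skb)->dest, .dport =
 * tcp_hdr(skb)->source, saddr from rt_spec_dst), which is how the TCP
 * reset path can emit a RST for a segment without owning a full
 * socket for the connection.
 */
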
void __init ip_init(void)
{
        ip_rt_init();
        inet_initpeers();

#if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
        igmp_mc_proc_init();
#endif
}

EXPORT_SYMBOL(ip_generic_getfrag);
EXPORT_SYMBOL(ip_queue_xmit);
EXPORT_SYMBOL(ip_send_check);