/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              PACKET - implements raw packet sockets.
 *
 * Authors:     Ross Biro
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
 *
 * Fixes:
 *              Alan Cox        :       verify_area() now used correctly
 *              Alan Cox        :       new skbuff lists, look ma no backlogs!
 *              Alan Cox        :       tidied skbuff lists.
 *              Alan Cox        :       Now uses generic datagram routines I
 *                                      added. Also fixed the peek/read crash
 *                                      from all old Linux datagram code.
 *              Alan Cox        :       Uses the improved datagram code.
 *              Alan Cox        :       Added NULL's for socket options.
 *              Alan Cox        :       Re-commented the code.
 *              Alan Cox        :       Use new kernel side addressing
 *              Rob Janssen     :       Correct MTU usage.
 *              Dave Platt      :       Counter leaks caused by incorrect
 *                                      interrupt locking and some slightly
 *                                      dubious gcc output. Can you read
 *                                      compiler: it said _VOLATILE_
 *      Richard Kooijman        :       Timestamp fixes.
 *              Alan Cox        :       New buffers. Use sk->mac.raw.
 *              Alan Cox        :       sendmsg/recvmsg support.
 *              Alan Cox        :       Protocol setting support
 *      Alexey Kuznetsov        :       Untied from IPv4 stack.
 *      Cyrus Durgin            :       Fixed kerneld for kmod.
 *      Michal Ostrowski        :       Module initialization cleanup.
 *         Ulises Alonso        :       Frame number limit removal and
 *                                      packet_set_ring memory leak.
 *              Eric Biederman  :       Allow for > 8 byte hardware addresses.
 *                                      The convention is that longer addresses
 *                                      will simply extend the hardware address
 *                                      byte arrays at the end of sockaddr_ll
 *                                      and packet_mreq.
 *              Johann Baudy    :       Added TX RING.
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 *
 */

#include <linux/types.h>
#include <linux/mm.h>
#include <linux/capability.h>
#include <linux/fcntl.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/if_packet.h>
#include <linux/wireless.h>
#include <linux/kernel.h>
#include <linux/kmod.h>
#include <linux/slab.h>
#include <net/net_namespace.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/ioctls.h>
#include <asm/page.h>
#include <asm/cacheflush.h>
#include <asm/io.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/poll.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/mutex.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/errqueue.h>

#ifdef CONFIG_INET
#include <net/inet_common.h>
#endif

/*
   Assumptions:
   - if a device has no dev->hard_header routine, it adds and removes the ll
     header inside itself. In this case the ll header is invisible outside
     the device, but higher levels should still reserve dev->hard_header_len.
     Some devices are clever enough to reallocate the skb when the header
     will not fit into the reserved space (tunnels); others are silly
     (PPP).
   - a packet socket receives packets with the ll header pulled,
     so SOCK_RAW should push it back.

On receive:
-----------

Incoming, dev->hard_header!=NULL
   mac_header -> ll header
   data       -> data

Outgoing, dev->hard_header!=NULL
   mac_header -> ll header
   data       -> ll header

Incoming, dev->hard_header==NULL
   mac_header -> UNKNOWN position. It is very likely that it points to the
                 ll header.  PPP does this, which is wrong, because it
                 introduces asymmetry between the rx and tx paths.
   data       -> data

Outgoing, dev->hard_header==NULL
   mac_header -> data. ll header is still not built!
   data       -> data

Summary:
  If dev->hard_header==NULL we are unlikely to restore a sensible ll header.


On transmit:
------------

dev->hard_header != NULL
   mac_header -> ll header
   data       -> ll header

dev->hard_header == NULL (ll header is added by the device, we cannot control it)
   mac_header -> data
   data       -> data

   We should set nh.raw on output to the correct position,
   the packet classifier depends on it.
 */

/* Private packet socket structures. */

struct packet_mclist {
        struct packet_mclist    *next;
        int                     ifindex;
        int                     count;
        unsigned short          type;
        unsigned short          alen;
        unsigned char           addr[MAX_ADDR_LEN];
};
/* identical to struct packet_mreq except it has
 * a longer address field.
 */
struct packet_mreq_max {
        int             mr_ifindex;
        unsigned short  mr_type;
        unsigned short  mr_alen;
        unsigned char   mr_address[MAX_ADDR_LEN];
};
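
/*
 * Illustrative sketch (userspace, not part of this file): the plain
 * struct packet_mreq that packet_mreq_max above mirrors is passed via
 * setsockopt() to, e.g., enable promiscuous mode on an interface.  The
 * interface name is an assumption for the example.
 *
 *      struct packet_mreq mreq = {
 *              .mr_ifindex = if_nametoindex("eth0"),
 *              .mr_type    = PACKET_MR_PROMISC,
 *      };
 *      setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP,
 *                 &mreq, sizeof(mreq));
 */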

static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
                int closing, int tx_ring);

struct packet_ring_buffer {
        char                    **pg_vec;
        unsigned int            head;
        unsigned int            frames_per_block;
        unsigned int            frame_size;
        unsigned int            frame_max;

        unsigned int            pg_vec_order;
        unsigned int            pg_vec_pages;
        unsigned int            pg_vec_len;

        atomic_t                pending;
};

struct packet_sock;
static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);

static void packet_flush_mclist(struct sock *sk);

struct packet_sock {
        /* struct sock has to be the first member of packet_sock */
        struct sock             sk;
        struct tpacket_stats    stats;
        struct packet_ring_buffer       rx_ring;
        struct packet_ring_buffer       tx_ring;
        int                     copy_thresh;
        spinlock_t              bind_lock;
        struct mutex            pg_vec_lock;
        unsigned int            running:1,      /* prot_hook is attached*/
                                auxdata:1,
                                origdev:1,
                                has_vnet_hdr:1;
        int                     ifindex;        /* bound device         */
        __be16                  num;
        struct packet_mclist    *mclist;
        atomic_t                mapped;
        enum tpacket_versions   tp_version;
        unsigned int            tp_hdrlen;
        unsigned int            tp_reserve;
        unsigned int            tp_loss:1;
        struct packet_type      prot_hook ____cacheline_aligned_in_smp;
};

struct packet_skb_cb {
        unsigned int origlen;
        union {
                struct sockaddr_pkt pkt;
                struct sockaddr_ll ll;
        } sa;
};

#define PACKET_SKB_CB(__skb)    ((struct packet_skb_cb *)((__skb)->cb))
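
/*
 * Note: packet_skb_cb lives in skb->cb[], a 48-byte scratch area; the
 * BUILD_BUG_ON() in packet_rcv() verifies that the cb together with a
 * maximal hardware address still fits.
 */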

static void __packet_set_status(struct packet_sock *po, void *frame, int status)
{
        union {
                struct tpacket_hdr *h1;
                struct tpacket2_hdr *h2;
                void *raw;
        } h;

        h.raw = frame;
        switch (po->tp_version) {
        case TPACKET_V1:
                h.h1->tp_status = status;
                flush_dcache_page(virt_to_page(&h.h1->tp_status));
                break;
        case TPACKET_V2:
                h.h2->tp_status = status;
                flush_dcache_page(virt_to_page(&h.h2->tp_status));
                break;
        default:
                pr_err("TPACKET version not supported\n");
                BUG();
        }

        smp_wmb();
}

static int __packet_get_status(struct packet_sock *po, void *frame)
{
        union {
                struct tpacket_hdr *h1;
                struct tpacket2_hdr *h2;
                void *raw;
        } h;

        smp_rmb();

        h.raw = frame;
        switch (po->tp_version) {
        case TPACKET_V1:
                flush_dcache_page(virt_to_page(&h.h1->tp_status));
                return h.h1->tp_status;
        case TPACKET_V2:
                flush_dcache_page(virt_to_page(&h.h2->tp_status));
                return h.h2->tp_status;
        default:
                pr_err("TPACKET version not supported\n");
                BUG();
                return 0;
        }
}
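
/*
 * __packet_set_status() ends with an smp_wmb() and __packet_get_status()
 * begins with an smp_rmb(): the barriers order the status word against
 * the surrounding frame accesses when the ring is shared with userspace.
 */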

static void *packet_lookup_frame(struct packet_sock *po,
                struct packet_ring_buffer *rb,
                unsigned int position,
                int status)
{
        unsigned int pg_vec_pos, frame_offset;
        union {
                struct tpacket_hdr *h1;
                struct tpacket2_hdr *h2;
                void *raw;
        } h;

        pg_vec_pos = position / rb->frames_per_block;
        frame_offset = position % rb->frames_per_block;

        h.raw = rb->pg_vec[pg_vec_pos] + (frame_offset * rb->frame_size);

        if (status != __packet_get_status(po, h.raw))
                return NULL;

        return h.raw;
}

static inline void *packet_current_frame(struct packet_sock *po,
                struct packet_ring_buffer *rb,
                int status)
{
        return packet_lookup_frame(po, rb, rb->head, status);
}

static inline void *packet_previous_frame(struct packet_sock *po,
                struct packet_ring_buffer *rb,
                int status)
{
        unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
        return packet_lookup_frame(po, rb, previous, status);
}

static inline void packet_increment_head(struct packet_ring_buffer *buff)
{
        buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
}
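
/*
 * frame_max is the highest valid frame index (the frame count minus one),
 * so the head wraps back to 0 once it moves past the last frame of the
 * ring.
 */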

static inline struct packet_sock *pkt_sk(struct sock *sk)
{
        return (struct packet_sock *)sk;
}

static void packet_sock_destruct(struct sock *sk)
{
        skb_queue_purge(&sk->sk_error_queue);

        WARN_ON(atomic_read(&sk->sk_rmem_alloc));
        WARN_ON(atomic_read(&sk->sk_wmem_alloc));

        if (!sock_flag(sk, SOCK_DEAD)) {
                pr_err("Attempt to release alive packet socket: %p\n", sk);
                return;
        }

        sk_refcnt_debug_dec(sk);
}


static const struct proto_ops packet_ops;

static const struct proto_ops packet_ops_spkt;

static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
                           struct packet_type *pt, struct net_device *orig_dev)
{
        struct sock *sk;
        struct sockaddr_pkt *spkt;

        /*
         *      When we registered the protocol we saved the socket in the data
         *      field for just this event.
         */

        sk = pt->af_packet_priv;

        /*
         *      Yank back the headers [hope the device set this
         *      right or kerboom...]
         *
         *      Incoming packets have the ll header pulled,
         *      push it back.
         *
         *      For outgoing ones skb->data == skb_mac_header(skb),
         *      so this procedure is a no-op.
         */

        if (skb->pkt_type == PACKET_LOOPBACK)
                goto out;

        if (!net_eq(dev_net(dev), sock_net(sk)))
                goto out;

        skb = skb_share_check(skb, GFP_ATOMIC);
        if (skb == NULL)
                goto oom;

        /* drop any routing info */
        skb_dst_drop(skb);

        /* drop conntrack reference */
        nf_reset(skb);

        spkt = &PACKET_SKB_CB(skb)->sa.pkt;

        skb_push(skb, skb->data - skb_mac_header(skb));

        /*
         *      The SOCK_PACKET socket receives _all_ frames.
         */

        spkt->spkt_family = dev->type;
        strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
        spkt->spkt_protocol = skb->protocol;

        /*
         *      Charge the memory to the socket. This is done specifically
         *      to prevent sockets using all the memory up.
         */

        if (sock_queue_rcv_skb(sk, skb) == 0)
                return 0;

out:
        kfree_skb(skb);
oom:
        return 0;
}


/*
 *      Output a raw packet to a device layer. This bypasses all the other
 *      protocol layers and you must therefore supply it with a complete frame
 */

static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
                               struct msghdr *msg, size_t len)
{
        struct sock *sk = sock->sk;
        struct sockaddr_pkt *saddr = (struct sockaddr_pkt *)msg->msg_name;
        struct sk_buff *skb = NULL;
        struct net_device *dev;
        __be16 proto = 0;
        int err;

        /*
         *      Get and verify the address.
         */

        if (saddr) {
                if (msg->msg_namelen < sizeof(struct sockaddr))
                        return -EINVAL;
                if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
                        proto = saddr->spkt_protocol;
        } else
                return -ENOTCONN;       /* SOCK_PACKET must be sent giving an address */

        /*
         *      Find the device first to size check it
         */

        saddr->spkt_device[13] = 0;
retry:
        rcu_read_lock();
        dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
        err = -ENODEV;
        if (dev == NULL)
                goto out_unlock;

        err = -ENETDOWN;
        if (!(dev->flags & IFF_UP))
                goto out_unlock;

        /*
         * You may not queue a frame bigger than the mtu. This is the lowest level
         * raw protocol and you must do your own fragmentation at this level.
         */

        err = -EMSGSIZE;
        if (len > dev->mtu + dev->hard_header_len)
                goto out_unlock;

        if (!skb) {
                size_t reserved = LL_RESERVED_SPACE(dev);
                unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0;

                rcu_read_unlock();
                skb = sock_wmalloc(sk, len + reserved, 0, GFP_KERNEL);
                if (skb == NULL)
                        return -ENOBUFS;
                /* FIXME: Save some space for broken drivers that write a hard
                 * header at transmission time by themselves. PPP is the notable
                 * one here. This should really be fixed at the driver level.
                 */
                skb_reserve(skb, reserved);
                skb_reset_network_header(skb);

                /* Try to align data part correctly */
                if (hhlen) {
                        skb->data -= hhlen;
                        skb->tail -= hhlen;
                        if (len < hhlen)
                                skb_reset_network_header(skb);
                }
                err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
                if (err)
                        goto out_free;
                goto retry;
        }


        skb->protocol = proto;
        skb->dev = dev;
        skb->priority = sk->sk_priority;
        skb->mark = sk->sk_mark;
        err = sock_tx_timestamp(msg, sk, skb_tx(skb));
        if (err < 0)
                goto out_unlock;

        dev_queue_xmit(skb);
        rcu_read_unlock();
        return len;

out_unlock:
        rcu_read_unlock();
out_free:
        kfree_skb(skb);
        return err;
}

static inline unsigned int run_filter(struct sk_buff *skb, struct sock *sk,
                                      unsigned int res)
{
        struct sk_filter *filter;

        rcu_read_lock_bh();
        filter = rcu_dereference_bh(sk->sk_filter);
        if (filter != NULL)
                res = sk_run_filter(skb, filter->insns, filter->len);
        rcu_read_unlock_bh();

        return res;
}
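
/*
 * A socket filter returns the number of bytes to keep: 0 means drop the
 * packet, and a smaller value truncates the snapshot; packet_rcv() and
 * tpacket_rcv() below cap snaplen with this result.
 */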

/*
   This function does lazy skb cloning, in the hope that most packets
   are discarded by BPF.

   Note the tricky part: we DO mangle the shared skb! skb->data, skb->len
   and skb->cb are mangled. It works because (and until) packets
   falling here are owned by the current CPU. Output packets are cloned
   by dev_queue_xmit_nit(), input packets are processed by net_bh
   sequentially, so if we return the skb to its original state on exit,
   we will not harm anyone.
 */

static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
                      struct packet_type *pt, struct net_device *orig_dev)
{
        struct sock *sk;
        struct sockaddr_ll *sll;
        struct packet_sock *po;
        u8 *skb_head = skb->data;
        int skb_len = skb->len;
        unsigned int snaplen, res;

        if (skb->pkt_type == PACKET_LOOPBACK)
                goto drop;

        sk = pt->af_packet_priv;
        po = pkt_sk(sk);

        if (!net_eq(dev_net(dev), sock_net(sk)))
                goto drop;

        skb->dev = dev;

        if (dev->header_ops) {
                /* The device has an explicit notion of an ll header,
                   exported to higher levels.

                   Otherwise, the device hides the details of its frame
                   structure, so that the corresponding packet head is
                   never delivered to the user.
                 */
                if (sk->sk_type != SOCK_DGRAM)
                        skb_push(skb, skb->data - skb_mac_header(skb));
                else if (skb->pkt_type == PACKET_OUTGOING) {
                        /* Special case: outgoing packets have ll header at head */
                        skb_pull(skb, skb_network_offset(skb));
                }
        }

        snaplen = skb->len;

        res = run_filter(skb, sk, snaplen);
        if (!res)
                goto drop_n_restore;
        if (snaplen > res)
                snaplen = res;

        if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
            (unsigned)sk->sk_rcvbuf)
                goto drop_n_acct;

        if (skb_shared(skb)) {
                struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
                if (nskb == NULL)
                        goto drop_n_acct;

                if (skb_head != skb->data) {
                        skb->data = skb_head;
                        skb->len = skb_len;
                }
                kfree_skb(skb);
                skb = nskb;
        }

        BUILD_BUG_ON(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8 >
                     sizeof(skb->cb));

        sll = &PACKET_SKB_CB(skb)->sa.ll;
        sll->sll_family = AF_PACKET;
        sll->sll_hatype = dev->type;
        sll->sll_protocol = skb->protocol;
        sll->sll_pkttype = skb->pkt_type;
        if (unlikely(po->origdev))
                sll->sll_ifindex = orig_dev->ifindex;
        else
                sll->sll_ifindex = dev->ifindex;

        sll->sll_halen = dev_parse_header(skb, sll->sll_addr);

        PACKET_SKB_CB(skb)->origlen = skb->len;

        if (pskb_trim(skb, snaplen))
                goto drop_n_acct;

        skb_set_owner_r(skb, sk);
        skb->dev = NULL;
        skb_dst_drop(skb);

        /* drop conntrack reference */
        nf_reset(skb);

        spin_lock(&sk->sk_receive_queue.lock);
        po->stats.tp_packets++;
        skb->dropcount = atomic_read(&sk->sk_drops);
        __skb_queue_tail(&sk->sk_receive_queue, skb);
        spin_unlock(&sk->sk_receive_queue.lock);
        sk->sk_data_ready(sk, skb->len);
        return 0;

drop_n_acct:
        po->stats.tp_drops = atomic_inc_return(&sk->sk_drops);

drop_n_restore:
        if (skb_head != skb->data && skb_shared(skb)) {
                skb->data = skb_head;
                skb->len = skb_len;
        }
drop:
        consume_skb(skb);
        return 0;
}

static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
                       struct packet_type *pt, struct net_device *orig_dev)
{
        struct sock *sk;
        struct packet_sock *po;
        struct sockaddr_ll *sll;
        union {
                struct tpacket_hdr *h1;
                struct tpacket2_hdr *h2;
                void *raw;
        } h;
        u8 *skb_head = skb->data;
        int skb_len = skb->len;
        unsigned int snaplen, res;
        unsigned long status = TP_STATUS_LOSING|TP_STATUS_USER;
        unsigned short macoff, netoff, hdrlen;
        struct sk_buff *copy_skb = NULL;
        struct timeval tv;
        struct timespec ts;

        if (skb->pkt_type == PACKET_LOOPBACK)
                goto drop;

        sk = pt->af_packet_priv;
        po = pkt_sk(sk);

        if (!net_eq(dev_net(dev), sock_net(sk)))
                goto drop;

        if (dev->header_ops) {
                if (sk->sk_type != SOCK_DGRAM)
                        skb_push(skb, skb->data - skb_mac_header(skb));
                else if (skb->pkt_type == PACKET_OUTGOING) {
                        /* Special case: outgoing packets have ll header at head */
                        skb_pull(skb, skb_network_offset(skb));
                }
        }

        if (skb->ip_summed == CHECKSUM_PARTIAL)
                status |= TP_STATUS_CSUMNOTREADY;

        snaplen = skb->len;

        res = run_filter(skb, sk, snaplen);
        if (!res)
                goto drop_n_restore;
        if (snaplen > res)
                snaplen = res;

        if (sk->sk_type == SOCK_DGRAM) {
                macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
                                  po->tp_reserve;
        } else {
                unsigned maclen = skb_network_offset(skb);
                netoff = TPACKET_ALIGN(po->tp_hdrlen +
                                       (maclen < 16 ? 16 : maclen)) +
                        po->tp_reserve;
                macoff = netoff - maclen;
        }

        if (macoff + snaplen > po->rx_ring.frame_size) {
                if (po->copy_thresh &&
                    atomic_read(&sk->sk_rmem_alloc) + skb->truesize <
                    (unsigned)sk->sk_rcvbuf) {
                        if (skb_shared(skb)) {
                                copy_skb = skb_clone(skb, GFP_ATOMIC);
                        } else {
                                copy_skb = skb_get(skb);
                                skb_head = skb->data;
                        }
                        if (copy_skb)
                                skb_set_owner_r(copy_skb, sk);
                }
                snaplen = po->rx_ring.frame_size - macoff;
                if ((int)snaplen < 0)
                        snaplen = 0;
        }

        spin_lock(&sk->sk_receive_queue.lock);
        h.raw = packet_current_frame(po, &po->rx_ring, TP_STATUS_KERNEL);
        if (!h.raw)
                goto ring_is_full;
        packet_increment_head(&po->rx_ring);
        po->stats.tp_packets++;
        if (copy_skb) {
                status |= TP_STATUS_COPY;
                __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
        }
        if (!po->stats.tp_drops)
                status &= ~TP_STATUS_LOSING;
        spin_unlock(&sk->sk_receive_queue.lock);

        skb_copy_bits(skb, 0, h.raw + macoff, snaplen);

        switch (po->tp_version) {
        case TPACKET_V1:
                h.h1->tp_len = skb->len;
                h.h1->tp_snaplen = snaplen;
                h.h1->tp_mac = macoff;
                h.h1->tp_net = netoff;
                if (skb->tstamp.tv64)
                        tv = ktime_to_timeval(skb->tstamp);
                else
                        do_gettimeofday(&tv);
                h.h1->tp_sec = tv.tv_sec;
                h.h1->tp_usec = tv.tv_usec;
                hdrlen = sizeof(*h.h1);
                break;
        case TPACKET_V2:
                h.h2->tp_len = skb->len;
                h.h2->tp_snaplen = snaplen;
                h.h2->tp_mac = macoff;
                h.h2->tp_net = netoff;
                if (skb->tstamp.tv64)
                        ts = ktime_to_timespec(skb->tstamp);
                else
                        getnstimeofday(&ts);
                h.h2->tp_sec = ts.tv_sec;
                h.h2->tp_nsec = ts.tv_nsec;
                h.h2->tp_vlan_tci = vlan_tx_tag_get(skb);
                h.h2->tp_padding = 0;
                hdrlen = sizeof(*h.h2);
                break;
        default:
                BUG();
        }

        sll = h.raw + TPACKET_ALIGN(hdrlen);
        sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
        sll->sll_family = AF_PACKET;
        sll->sll_hatype = dev->type;
        sll->sll_protocol = skb->protocol;
        sll->sll_pkttype = skb->pkt_type;
        if (unlikely(po->origdev))
                sll->sll_ifindex = orig_dev->ifindex;
        else
                sll->sll_ifindex = dev->ifindex;

        __packet_set_status(po, h.raw, status);
        smp_mb();
        {
                struct page *p_start, *p_end;
                u8 *h_end = h.raw + macoff + snaplen - 1;

                p_start = virt_to_page(h.raw);
                p_end = virt_to_page(h_end);
                while (p_start <= p_end) {
                        flush_dcache_page(p_start);
                        p_start++;
                }
        }

        sk->sk_data_ready(sk, 0);

drop_n_restore:
        if (skb_head != skb->data && skb_shared(skb)) {
                skb->data = skb_head;
                skb->len = skb_len;
        }
drop:
        kfree_skb(skb);
        return 0;

ring_is_full:
        po->stats.tp_drops++;
        spin_unlock(&sk->sk_receive_queue.lock);

        sk->sk_data_ready(sk, 0);
        kfree_skb(copy_skb);
        goto drop_n_restore;
}
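
/*
 * Illustrative sketch (userspace, assumptions noted): the consuming side
 * of the RX ring filled in by tpacket_rcv() above.  "ring" is assumed to
 * be the mmap()ed buffer and "i" the reader's own frame counter.
 *
 *      struct tpacket_hdr *hdr = (void *)(ring + i * frame_size);
 *
 *      while (!(hdr->tp_status & TP_STATUS_USER))
 *              poll(&pfd, 1, -1);              // wait for the kernel
 *      // frame payload starts at (u8 *)hdr + hdr->tp_mac
 *      hdr->tp_status = TP_STATUS_KERNEL;      // hand the frame back
 */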

static void tpacket_destruct_skb(struct sk_buff *skb)
{
        struct packet_sock *po = pkt_sk(skb->sk);
        void *ph;

        BUG_ON(skb == NULL);

        if (likely(po->tx_ring.pg_vec)) {
                ph = skb_shinfo(skb)->destructor_arg;
                BUG_ON(__packet_get_status(po, ph) != TP_STATUS_SENDING);
                BUG_ON(atomic_read(&po->tx_ring.pending) == 0);
                atomic_dec(&po->tx_ring.pending);
                __packet_set_status(po, ph, TP_STATUS_AVAILABLE);
        }

        sock_wfree(skb);
}

static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
                void *frame, struct net_device *dev, int size_max,
                __be16 proto, unsigned char *addr)
{
        union {
                struct tpacket_hdr *h1;
                struct tpacket2_hdr *h2;
                void *raw;
        } ph;
        int to_write, offset, len, tp_len, nr_frags, len_max;
        struct socket *sock = po->sk.sk_socket;
        struct page *page;
        void *data;
        int err;

        ph.raw = frame;

        skb->protocol = proto;
        skb->dev = dev;
        skb->priority = po->sk.sk_priority;
        skb->mark = po->sk.sk_mark;
        skb_shinfo(skb)->destructor_arg = ph.raw;

        switch (po->tp_version) {
        case TPACKET_V2:
                tp_len = ph.h2->tp_len;
                break;
        default:
                tp_len = ph.h1->tp_len;
                break;
        }
        if (unlikely(tp_len > size_max)) {
                pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
                return -EMSGSIZE;
        }

        skb_reserve(skb, LL_RESERVED_SPACE(dev));
        skb_reset_network_header(skb);

        data = ph.raw + po->tp_hdrlen - sizeof(struct sockaddr_ll);
        to_write = tp_len;

        if (sock->type == SOCK_DGRAM) {
                err = dev_hard_header(skb, dev, ntohs(proto), addr,
                                NULL, tp_len);
                if (unlikely(err < 0))
                        return -EINVAL;
        } else if (dev->hard_header_len) {
                /* net device doesn't like empty head */
                if (unlikely(tp_len <= dev->hard_header_len)) {
                        pr_err("packet size is too short (%d < %d)\n",
                               tp_len, dev->hard_header_len);
                        return -EINVAL;
                }

                skb_push(skb, dev->hard_header_len);
                err = skb_store_bits(skb, 0, data,
                                dev->hard_header_len);
                if (unlikely(err))
                        return err;

                data += dev->hard_header_len;
                to_write -= dev->hard_header_len;
        }

        err = -EFAULT;
        page = virt_to_page(data);
        offset = offset_in_page(data);
        len_max = PAGE_SIZE - offset;
        len = ((to_write > len_max) ? len_max : to_write);

        skb->data_len = to_write;
        skb->len += to_write;
        skb->truesize += to_write;
        atomic_add(to_write, &po->sk.sk_wmem_alloc);

        while (likely(to_write)) {
                nr_frags = skb_shinfo(skb)->nr_frags;

                if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
                        pr_err("Packet exceeds the number of skb frags (%lu)\n",
                               MAX_SKB_FRAGS);
                        return -EFAULT;
                }

                flush_dcache_page(page);
                get_page(page);
                skb_fill_page_desc(skb,
                                nr_frags,
                                page++, offset, len);
                to_write -= len;
                offset = 0;
                len_max = PAGE_SIZE;
                len = ((to_write > len_max) ? len_max : to_write);
        }

        return tp_len;
}

static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
{
        struct socket *sock;
        struct sk_buff *skb;
        struct net_device *dev;
        __be16 proto;
        int ifindex, err, reserve = 0;
        void *ph;
        struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
        int tp_len, size_max;
        unsigned char *addr;
        int len_sum = 0;
        int status = 0;

        sock = po->sk.sk_socket;

        mutex_lock(&po->pg_vec_lock);

        err = -EBUSY;
        if (saddr == NULL) {
                ifindex = po->ifindex;
                proto   = po->num;
                addr    = NULL;
        } else {
                err = -EINVAL;
                if (msg->msg_namelen < sizeof(struct sockaddr_ll))
                        goto out;
                if (msg->msg_namelen < (saddr->sll_halen
                                        + offsetof(struct sockaddr_ll,
                                                sll_addr)))
                        goto out;
                ifindex = saddr->sll_ifindex;
                proto   = saddr->sll_protocol;
                addr    = saddr->sll_addr;
        }

        dev = dev_get_by_index(sock_net(&po->sk), ifindex);
        err = -ENXIO;
        if (unlikely(dev == NULL))
                goto out;

        reserve = dev->hard_header_len;

        err = -ENETDOWN;
        if (unlikely(!(dev->flags & IFF_UP)))
                goto out_put;

        size_max = po->tx_ring.frame_size
                - (po->tp_hdrlen - sizeof(struct sockaddr_ll));

        if (size_max > dev->mtu + reserve)
                size_max = dev->mtu + reserve;

        do {
                ph = packet_current_frame(po, &po->tx_ring,
                                TP_STATUS_SEND_REQUEST);

                if (unlikely(ph == NULL)) {
                        schedule();
                        continue;
                }

                status = TP_STATUS_SEND_REQUEST;
                skb = sock_alloc_send_skb(&po->sk,
                                LL_ALLOCATED_SPACE(dev)
                                + sizeof(struct sockaddr_ll),
                                0, &err);

                if (unlikely(skb == NULL))
                        goto out_status;

                tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto,
                                addr);

                if (unlikely(tp_len < 0)) {
                        if (po->tp_loss) {
                                __packet_set_status(po, ph,
                                                TP_STATUS_AVAILABLE);
                                packet_increment_head(&po->tx_ring);
                                kfree_skb(skb);
                                continue;
                        } else {
                                status = TP_STATUS_WRONG_FORMAT;
                                err = tp_len;
                                goto out_status;
                        }
                }

                skb->destructor = tpacket_destruct_skb;
                __packet_set_status(po, ph, TP_STATUS_SENDING);
                atomic_inc(&po->tx_ring.pending);

                status = TP_STATUS_SEND_REQUEST;
                err = dev_queue_xmit(skb);
                if (unlikely(err > 0)) {
                        err = net_xmit_errno(err);
                        if (err && __packet_get_status(po, ph) ==
                                   TP_STATUS_AVAILABLE) {
                                /* skb was destructed already */
                                skb = NULL;
                                goto out_status;
                        }
                        /*
                         * skb was dropped but not destructed yet;
                         * let's treat it like congestion or err < 0
                         */
                        err = 0;
                }
                packet_increment_head(&po->tx_ring);
                len_sum += tp_len;
        } while (likely((ph != NULL) ||
                        ((!(msg->msg_flags & MSG_DONTWAIT)) &&
                         (atomic_read(&po->tx_ring.pending))))
                );

        err = len_sum;
        goto out_put;

out_status:
        __packet_set_status(po, ph, status);
        kfree_skb(skb);
out_put:
        dev_put(dev);
out:
        mutex_unlock(&po->pg_vec_lock);
        return err;
}
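
/*
 * TX ring status walk-through, matching the transitions above: userspace
 * fills a frame and marks it TP_STATUS_SEND_REQUEST; tpacket_snd() claims
 * it and sets TP_STATUS_SENDING; tpacket_destruct_skb() returns it to
 * TP_STATUS_AVAILABLE once the skb is freed.  A malformed frame is parked
 * as TP_STATUS_WRONG_FORMAT unless po->tp_loss is set.
 */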

static inline struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
                                               size_t reserve, size_t len,
                                               size_t linear, int noblock,
                                               int *err)
{
        struct sk_buff *skb;

        /* Under a page?  Don't bother with paged skb. */
        if (prepad + len < PAGE_SIZE || !linear)
                linear = len;

        skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
                                   err);
        if (!skb)
                return NULL;

        skb_reserve(skb, reserve);
        skb_put(skb, linear);
        skb->data_len = len - linear;
        skb->len += len - linear;

        return skb;
}
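
/*
 * The linear part of the skb is sized to "linear" (packet_snd() passes
 * the virtio header's hdr_len hint); everything beyond it is allocated
 * as paged data, so large sends stay off the linear area.
 */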

static int packet_snd(struct socket *sock,
                          struct msghdr *msg, size_t len)
{
        struct sock *sk = sock->sk;
        struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
        struct sk_buff *skb;
        struct net_device *dev;
        __be16 proto;
        unsigned char *addr;
        int ifindex, err, reserve = 0;
        struct virtio_net_hdr vnet_hdr = { 0 };
        int offset = 0;
        int vnet_hdr_len;
        struct packet_sock *po = pkt_sk(sk);
        unsigned short gso_type = 0;

        /*
         *      Get and verify the address.
         */

        if (saddr == NULL) {
                ifindex = po->ifindex;
                proto   = po->num;
                addr    = NULL;
        } else {
                err = -EINVAL;
                if (msg->msg_namelen < sizeof(struct sockaddr_ll))
                        goto out;
                if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
                        goto out;
                ifindex = saddr->sll_ifindex;
                proto   = saddr->sll_protocol;
                addr    = saddr->sll_addr;
        }


        dev = dev_get_by_index(sock_net(sk), ifindex);
        err = -ENXIO;
        if (dev == NULL)
                goto out_unlock;
        if (sock->type == SOCK_RAW)
                reserve = dev->hard_header_len;

        err = -ENETDOWN;
        if (!(dev->flags & IFF_UP))
                goto out_unlock;

        if (po->has_vnet_hdr) {
                vnet_hdr_len = sizeof(vnet_hdr);

                err = -EINVAL;
                if (len < vnet_hdr_len)
                        goto out_unlock;

                len -= vnet_hdr_len;

                err = memcpy_fromiovec((void *)&vnet_hdr, msg->msg_iov,
                                       vnet_hdr_len);
                if (err < 0)
                        goto out_unlock;

                if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
                    (vnet_hdr.csum_start + vnet_hdr.csum_offset + 2 >
                      vnet_hdr.hdr_len))
                        vnet_hdr.hdr_len = vnet_hdr.csum_start +
                                                 vnet_hdr.csum_offset + 2;

                err = -EINVAL;
                if (vnet_hdr.hdr_len > len)
                        goto out_unlock;

                if (vnet_hdr.gso_type != VIRTIO_NET_HDR_GSO_NONE) {
                        switch (vnet_hdr.gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
                        case VIRTIO_NET_HDR_GSO_TCPV4:
                                gso_type = SKB_GSO_TCPV4;
                                break;
                        case VIRTIO_NET_HDR_GSO_TCPV6:
                                gso_type = SKB_GSO_TCPV6;
                                break;
                        case VIRTIO_NET_HDR_GSO_UDP:
                                gso_type = SKB_GSO_UDP;
                                break;
                        default:
                                goto out_unlock;
                        }

                        if (vnet_hdr.gso_type & VIRTIO_NET_HDR_GSO_ECN)
                                gso_type |= SKB_GSO_TCP_ECN;

                        if (vnet_hdr.gso_size == 0)
                                goto out_unlock;

                }
        }

        err = -EMSGSIZE;
        if (!gso_type && (len > dev->mtu+reserve))
                goto out_unlock;

        err = -ENOBUFS;
        skb = packet_alloc_skb(sk, LL_ALLOCATED_SPACE(dev),
                               LL_RESERVED_SPACE(dev), len, vnet_hdr.hdr_len,
                               msg->msg_flags & MSG_DONTWAIT, &err);
        if (skb == NULL)
                goto out_unlock;

        skb_set_network_header(skb, reserve);

        err = -EINVAL;
        if (sock->type == SOCK_DGRAM &&
            (offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len)) < 0)
                goto out_free;

        /* Returns -EFAULT on error */
        err = skb_copy_datagram_from_iovec(skb, offset, msg->msg_iov, 0, len);
        if (err)
                goto out_free;
        err = sock_tx_timestamp(msg, sk, skb_tx(skb));
        if (err < 0)
                goto out_free;

        skb->protocol = proto;
        skb->dev = dev;
        skb->priority = sk->sk_priority;
        skb->mark = sk->sk_mark;

        if (po->has_vnet_hdr) {
                if (vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
                        if (!skb_partial_csum_set(skb, vnet_hdr.csum_start,
                                                  vnet_hdr.csum_offset)) {
                                err = -EINVAL;
                                goto out_free;
                        }
                }

                skb_shinfo(skb)->gso_size = vnet_hdr.gso_size;
                skb_shinfo(skb)->gso_type = gso_type;

                /* Header must be checked, and gso_segs computed. */
                skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
                skb_shinfo(skb)->gso_segs = 0;

                len += vnet_hdr_len;
        }

        /*
         *      Now send it
         */

        err = dev_queue_xmit(skb);
        if (err > 0 && (err = net_xmit_errno(err)) != 0)
                goto out_unlock;

        dev_put(dev);

        return len;

out_free:
        kfree_skb(skb);
out_unlock:
        if (dev)
                dev_put(dev);
out:
        return err;
}

static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
                struct msghdr *msg, size_t len)
{
        struct sock *sk = sock->sk;
        struct packet_sock *po = pkt_sk(sk);
        if (po->tx_ring.pg_vec)
                return tpacket_snd(po, msg);
        else
                return packet_snd(sock, msg, len);
}

/*
 *      Close a PACKET socket. This is fairly simple. We immediately go
 *      to 'closed' state and remove our protocol entry in the device list.
 */

static int packet_release(struct socket *sock)
{
        struct sock *sk = sock->sk;
        struct packet_sock *po;
        struct net *net;
        struct tpacket_req req;

        if (!sk)
                return 0;

        net = sock_net(sk);
        po = pkt_sk(sk);

        spin_lock_bh(&net->packet.sklist_lock);
        sk_del_node_init_rcu(sk);
        sock_prot_inuse_add(net, sk->sk_prot, -1);
        spin_unlock_bh(&net->packet.sklist_lock);

        spin_lock(&po->bind_lock);
        if (po->running) {
                /*
                 * Remove from protocol table
                 */
                po->running = 0;
                po->num = 0;
                __dev_remove_pack(&po->prot_hook);
                __sock_put(sk);
        }
        spin_unlock(&po->bind_lock);

        packet_flush_mclist(sk);

        memset(&req, 0, sizeof(req));

        if (po->rx_ring.pg_vec)
                packet_set_ring(sk, &req, 1, 0);

        if (po->tx_ring.pg_vec)
                packet_set_ring(sk, &req, 1, 1);

        synchronize_net();
        /*
         *      Now the socket is dead. No more input will appear.
         */
        sock_orphan(sk);
        sock->sk = NULL;

        /* Purge queues */

        skb_queue_purge(&sk->sk_receive_queue);
        sk_refcnt_debug_release(sk);

        sock_put(sk);
        return 0;
}

/*
 *      Attach a packet hook.
 */

static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protocol)
{
        struct packet_sock *po = pkt_sk(sk);
        /*
         *      Detach an existing hook if present.
         */

        lock_sock(sk);

        spin_lock(&po->bind_lock);
        if (po->running) {
                __sock_put(sk);
                po->running = 0;
                po->num = 0;
                spin_unlock(&po->bind_lock);
                dev_remove_pack(&po->prot_hook);
                spin_lock(&po->bind_lock);
        }

        po->num = protocol;
        po->prot_hook.type = protocol;
        po->prot_hook.dev = dev;

        po->ifindex = dev ? dev->ifindex : 0;

        if (protocol == 0)
                goto out_unlock;

        if (!dev || (dev->flags & IFF_UP)) {
                dev_add_pack(&po->prot_hook);
                sock_hold(sk);
                po->running = 1;
        } else {
                sk->sk_err = ENETDOWN;
                if (!sock_flag(sk, SOCK_DEAD))
                        sk->sk_error_report(sk);
        }

out_unlock:
        spin_unlock(&po->bind_lock);
        release_sock(sk);
        return 0;
}

/*
 *      Bind a packet socket to a device
 */

static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
                            int addr_len)
{
        struct sock *sk = sock->sk;
        char name[15];
        struct net_device *dev;
        int err = -ENODEV;

        /*
         *      Check legality
         */

        if (addr_len != sizeof(struct sockaddr))
                return -EINVAL;
        strlcpy(name, uaddr->sa_data, sizeof(name));

        dev = dev_get_by_name(sock_net(sk), name);
        if (dev) {
                err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
                dev_put(dev);
        }
        return err;
}

static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
        struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
        struct sock *sk = sock->sk;
        struct net_device *dev = NULL;
        int err;


        /*
         *      Check legality
         */

        if (addr_len < sizeof(struct sockaddr_ll))
                return -EINVAL;
        if (sll->sll_family != AF_PACKET)
                return -EINVAL;

        if (sll->sll_ifindex) {
                err = -ENODEV;
                dev = dev_get_by_index(sock_net(sk), sll->sll_ifindex);
                if (dev == NULL)
                        goto out;
        }
        err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);
        if (dev)
                dev_put(dev);

out:
        return err;
}

static struct proto packet_proto = {
        .name     = "PACKET",
        .owner    = THIS_MODULE,
        .obj_size = sizeof(struct packet_sock),
};

/*
 *      Create a packet of type SOCK_PACKET.
 */

static int packet_create(struct net *net, struct socket *sock, int protocol,
                         int kern)
{
        struct sock *sk;
        struct packet_sock *po;
        __be16 proto = (__force __be16)protocol; /* weird, but documented */
        int err;

        if (!capable(CAP_NET_RAW))
                return -EPERM;
        if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
            sock->type != SOCK_PACKET)
                return -ESOCKTNOSUPPORT;

        sock->state = SS_UNCONNECTED;

        err = -ENOBUFS;
        sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto);
        if (sk == NULL)
                goto out;

        sock->ops = &packet_ops;
        if (sock->type == SOCK_PACKET)
                sock->ops = &packet_ops_spkt;

        sock_init_data(sock, sk);

        po = pkt_sk(sk);
        sk->sk_family = PF_PACKET;
        po->num = proto;

        sk->sk_destruct = packet_sock_destruct;
        sk_refcnt_debug_inc(sk);

        /*
         *      Attach a protocol block
         */

        spin_lock_init(&po->bind_lock);
        mutex_init(&po->pg_vec_lock);
        po->prot_hook.func = packet_rcv;

        if (sock->type == SOCK_PACKET)
                po->prot_hook.func = packet_rcv_spkt;

        po->prot_hook.af_packet_priv = sk;

        if (proto) {
                po->prot_hook.type = proto;
                dev_add_pack(&po->prot_hook);
                sock_hold(sk);
                po->running = 1;
        }

        spin_lock_bh(&net->packet.sklist_lock);
        sk_add_node_rcu(sk, &net->packet.sklist);
        sock_prot_inuse_add(net, &packet_proto, 1);
        spin_unlock_bh(&net->packet.sklist_lock);

        return 0;
out:
        return err;
}

static int packet_recv_error(struct sock *sk, struct msghdr *msg, int len)
{
        struct sock_exterr_skb *serr;
        struct sk_buff *skb, *skb2;
        int copied, err;

        err = -EAGAIN;
        skb = skb_dequeue(&sk->sk_error_queue);
        if (skb == NULL)
                goto out;

        copied = skb->len;
        if (copied > len) {
                msg->msg_flags |= MSG_TRUNC;
                copied = len;
        }
        err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
        if (err)
                goto out_free_skb;

        sock_recv_timestamp(msg, sk, skb);

        serr = SKB_EXT_ERR(skb);
        put_cmsg(msg, SOL_PACKET, PACKET_TX_TIMESTAMP,
                 sizeof(serr->ee), &serr->ee);

        msg->msg_flags |= MSG_ERRQUEUE;
        err = copied;

        /* Reset and regenerate socket error */
        spin_lock_bh(&sk->sk_error_queue.lock);
        sk->sk_err = 0;
        if ((skb2 = skb_peek(&sk->sk_error_queue)) != NULL) {
                sk->sk_err = SKB_EXT_ERR(skb2)->ee.ee_errno;
                spin_unlock_bh(&sk->sk_error_queue.lock);
                sk->sk_error_report(sk);
        } else
                spin_unlock_bh(&sk->sk_error_queue.lock);

out_free_skb:
        kfree_skb(skb);
out:
        return err;
}

/*
 *      Pull a packet from our receive queue and hand it to the user.
 *      If necessary we block.
 */

static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
                          struct msghdr *msg, size_t len, int flags)
{
        struct sock *sk = sock->sk;
        struct sk_buff *skb;
        int copied, err;
        struct sockaddr_ll *sll;
        int vnet_hdr_len = 0;

        err = -EINVAL;
        if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE))
                goto out;

#if 0
        /* What error should we return now? EUNATTACH? */
        if (pkt_sk(sk)->ifindex < 0)
                return -ENODEV;
#endif

        if (flags & MSG_ERRQUEUE) {
                err = packet_recv_error(sk, msg, len);
                goto out;
        }

        /*
         *      Call the generic datagram receiver. This handles all sorts
         *      of horrible races and re-entrancy so we can forget about it
         *      in the protocol layers.
         *
         *      Now it will return ENETDOWN if the device has just gone down,
         *      but then it will block.
         */

        skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);

        /*
         *      An error occurred, so return it. Because skb_recv_datagram()
         *      handles the blocking, we don't need to see and worry about
         *      blocking retries.
         */
1590
1591         if (skb == NULL)
1592                 goto out;
1593
        if (pkt_sk(sk)->has_vnet_hdr) {
                struct virtio_net_hdr vnet_hdr = { 0 };

                err = -EINVAL;
                vnet_hdr_len = sizeof(vnet_hdr);
                if (len < vnet_hdr_len)
                        goto out_free;

                len -= vnet_hdr_len;

                if (skb_is_gso(skb)) {
                        struct skb_shared_info *sinfo = skb_shinfo(skb);

                        /* This is a hint as to how much should be linear. */
                        vnet_hdr.hdr_len = skb_headlen(skb);
                        vnet_hdr.gso_size = sinfo->gso_size;
                        if (sinfo->gso_type & SKB_GSO_TCPV4)
                                vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
                        else if (sinfo->gso_type & SKB_GSO_TCPV6)
                                vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
                        else if (sinfo->gso_type & SKB_GSO_UDP)
                                vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_UDP;
                        else if (sinfo->gso_type & SKB_GSO_FCOE)
                                goto out_free;
                        else
                                BUG();
                        if (sinfo->gso_type & SKB_GSO_TCP_ECN)
                                vnet_hdr.gso_type |= VIRTIO_NET_HDR_GSO_ECN;
                } else
                        vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE;

                if (skb->ip_summed == CHECKSUM_PARTIAL) {
                        vnet_hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
                        vnet_hdr.csum_start = skb->csum_start -
                                                        skb_headroom(skb);
                        vnet_hdr.csum_offset = skb->csum_offset;
                } /* else everything is zero */

                err = memcpy_toiovec(msg->msg_iov, (void *)&vnet_hdr,
                                     vnet_hdr_len);
                if (err < 0)
                        goto out_free;
        }

        /*
         *      If the address length field is there to be filled in, we fill
         *      it in now.
         */

        sll = &PACKET_SKB_CB(skb)->sa.ll;
        if (sock->type == SOCK_PACKET)
                msg->msg_namelen = sizeof(struct sockaddr_pkt);
        else
                msg->msg_namelen = sll->sll_halen + offsetof(struct sockaddr_ll, sll_addr);

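        /*
         * SOCK_PACKET names are a fixed-size sockaddr_pkt; other packet
         * sockets get a sockaddr_ll whose length depends on the captured
         * link-layer address (sll_halen).
         */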
        /*
         *      You lose any data beyond the buffer you gave. If this
         *      worries a user program, it can ask the device for its MTU.
         */

        copied = skb->len;
        if (copied > len) {
                copied = len;
                msg->msg_flags |= MSG_TRUNC;
        }

        err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
        if (err)
                goto out_free;

        sock_recv_ts_and_drops(msg, sk, skb);

        if (msg->msg_name)
                memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
                       msg->msg_namelen);

        if (pkt_sk(sk)->auxdata) {
                struct tpacket_auxdata aux;

                aux.tp_status = TP_STATUS_USER;
                if (skb->ip_summed == CHECKSUM_PARTIAL)
                        aux.tp_status |= TP_STATUS_CSUMNOTREADY;
                aux.tp_len = PACKET_SKB_CB(skb)->origlen;
                aux.tp_snaplen = skb->len;
                aux.tp_mac = 0;
                aux.tp_net = skb_network_offset(skb);
                aux.tp_vlan_tci = vlan_tx_tag_get(skb);

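                /*
                 * Zero the padding explicitly: the struct is copied to
                 * userspace as a cmsg, and uninitialized stack bytes in
                 * it would leak kernel memory.
                 */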
                aux.tp_padding = 0;
                put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
        }

        /*
         *      Free or return the buffer as appropriate. Again this
         *      hides all the races and re-entrancy issues from us.
         */
        err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied);

out_free:
        skb_free_datagram(sk, skb);
out:
        return err;
}

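/*
 * Report the bound device name for a SOCK_PACKET socket. The name is
 * truncated to the 14-byte sa_data if necessary; strncpy() zero-pads
 * shorter names.
 */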
static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
                               int *uaddr_len, int peer)
{
        struct net_device *dev;
        struct sock *sk = sock->sk;

        if (peer)
                return -EOPNOTSUPP;

        uaddr->sa_family = AF_PACKET;
        rcu_read_lock();
        dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex);
        if (dev)
                strncpy(uaddr->sa_data, dev->name, 14);
        else
                memset(uaddr->sa_data, 0, 14);
        rcu_read_unlock();
        *uaddr_len = sizeof(*uaddr);

        return 0;
}

static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
                          int *uaddr_len, int peer)
{
        struct net_device *dev;
        struct sock *sk = sock->sk;
        struct packet_sock *po = pkt_sk(sk);
        DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);

        if (peer)
                return -EOPNOTSUPP;

        sll->sll_family = AF_PACKET;
        sll->sll_ifindex = po->ifindex;
        sll->sll_protocol = po->num;
        sll->sll_pkttype = 0;
        rcu_read_lock();
        dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex);
        if (dev) {
                sll->sll_hatype = dev->type;
                sll->sll_halen = dev->addr_len;
                memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
        } else {
                sll->sll_hatype = 0;    /* Bad: we have no ARPHRD_UNSPEC */
                sll->sll_halen = 0;
        }
        rcu_read_unlock();
        *uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;

        return 0;
}

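/*
 * Apply one membership entry to a device. "what" follows the
 * dev_set_promiscuity()/dev_set_allmulti() convention: a positive
 * count enables (adds a reference), a negative count disables.
 */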
static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
                         int what)
{
        switch (i->type) {
        case PACKET_MR_MULTICAST:
                if (i->alen != dev->addr_len)
                        return -EINVAL;
                if (what > 0)
                        return dev_mc_add(dev, i->addr);
                else
                        return dev_mc_del(dev, i->addr);
        case PACKET_MR_PROMISC:
                return dev_set_promiscuity(dev, what);
        case PACKET_MR_ALLMULTI:
                return dev_set_allmulti(dev, what);
        case PACKET_MR_UNICAST:
                if (i->alen != dev->addr_len)
                        return -EINVAL;
                if (what > 0)
                        return dev_uc_add(dev, i->addr);
                else
                        return dev_uc_del(dev, i->addr);
        default:
                break;
        }
        return 0;
}

static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what)
{
        for ( ; i; i = i->next) {
                if (i->ifindex == dev->ifindex)
                        packet_dev_mc(dev, i, what);
        }
}

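/*
 * Add a membership under rtnl_lock. Entries are reference counted:
 * a duplicate request just bumps ml->count, and device-level state is
 * only touched when a genuinely new entry is linked in.
 */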
static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
{
        struct packet_sock *po = pkt_sk(sk);
        struct packet_mclist *ml, *i;
        struct net_device *dev;
        int err;

        rtnl_lock();

        err = -ENODEV;
        dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
        if (!dev)
                goto done;

        err = -EINVAL;
        if (mreq->mr_alen > dev->addr_len)
                goto done;

        err = -ENOBUFS;
        i = kmalloc(sizeof(*i), GFP_KERNEL);
        if (i == NULL)
                goto done;

        err = 0;
        for (ml = po->mclist; ml; ml = ml->next) {
                if (ml->ifindex == mreq->mr_ifindex &&
                    ml->type == mreq->mr_type &&
                    ml->alen == mreq->mr_alen &&
                    memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
                        ml->count++;
                        /* Free the new element ... */
                        kfree(i);
                        goto done;
                }
        }

        i->type = mreq->mr_type;
        i->ifindex = mreq->mr_ifindex;
        i->alen = mreq->mr_alen;
        memcpy(i->addr, mreq->mr_address, i->alen);
        i->count = 1;
        i->next = po->mclist;
        po->mclist = i;
        err = packet_dev_mc(dev, i, 1);
        if (err) {
                po->mclist = i->next;
                kfree(i);
        }

done:
        rtnl_unlock();
        return err;
}

static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
{
        struct packet_mclist *ml, **mlp;

        rtnl_lock();

        for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
                if (ml->ifindex == mreq->mr_ifindex &&
                    ml->type == mreq->mr_type &&
                    ml->alen == mreq->mr_alen &&
                    memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
                        if (--ml->count == 0) {
                                struct net_device *dev;
                                *mlp = ml->next;
                                dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
                                if (dev)
                                        packet_dev_mc(dev, ml, -1);
                                kfree(ml);
                        }
                        rtnl_unlock();
                        return 0;
                }
        }
        rtnl_unlock();
        return -EADDRNOTAVAIL;
}

static void packet_flush_mclist(struct sock *sk)
{
        struct packet_sock *po = pkt_sk(sk);
        struct packet_mclist *ml;

        if (!po->mclist)
                return;

        rtnl_lock();
        while ((ml = po->mclist) != NULL) {
                struct net_device *dev;

                po->mclist = ml->next;
                dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
                if (dev != NULL)
                        packet_dev_mc(dev, ml, -1);
                kfree(ml);
        }
        rtnl_unlock();
}

static int
packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
{
        struct sock *sk = sock->sk;
        struct packet_sock *po = pkt_sk(sk);
        int ret;

        if (level != SOL_PACKET)
                return -ENOPROTOOPT;

        switch (optname) {
        case PACKET_ADD_MEMBERSHIP:
        case PACKET_DROP_MEMBERSHIP:
        {
                struct packet_mreq_max mreq;
                int len = optlen;
                memset(&mreq, 0, sizeof(mreq));
                if (len < sizeof(struct packet_mreq))
                        return -EINVAL;
                if (len > sizeof(mreq))
                        len = sizeof(mreq);
                if (copy_from_user(&mreq, optval, len))
                        return -EFAULT;
                if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
                        return -EINVAL;
                if (optname == PACKET_ADD_MEMBERSHIP)
                        ret = packet_mc_add(sk, &mreq);
                else
                        ret = packet_mc_drop(sk, &mreq);
                return ret;
        }

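        /*
         * Configure a mmap()ed RX or TX ring. A minimal userspace sketch,
         * illustrative only, with example sizes that happen to satisfy the
         * sanity checks in packet_set_ring() (block size a multiple of
         * PAGE_SIZE, frame size TPACKET_ALIGNMENT-aligned, and
         * frames-per-block * blocks == frames):
         *
         *      struct tpacket_req req = {
         *              .tp_block_size  = 4096,
         *              .tp_block_nr    = 64,
         *              .tp_frame_size  = 2048,
         *              .tp_frame_nr    = 128,
         *      };
         *      setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
         *      ring = mmap(NULL, req.tp_block_size * req.tp_block_nr,
         *                  PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
         */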
        case PACKET_RX_RING:
        case PACKET_TX_RING:
        {
                struct tpacket_req req;

                if (optlen < sizeof(req))
                        return -EINVAL;
                if (pkt_sk(sk)->has_vnet_hdr)
                        return -EINVAL;
                if (copy_from_user(&req, optval, sizeof(req)))
                        return -EFAULT;
                return packet_set_ring(sk, &req, 0, optname == PACKET_TX_RING);
        }
        case PACKET_COPY_THRESH:
        {
                int val;

                if (optlen != sizeof(val))
                        return -EINVAL;
                if (copy_from_user(&val, optval, sizeof(val)))
                        return -EFAULT;

                pkt_sk(sk)->copy_thresh = val;
                return 0;
        }
        case PACKET_VERSION:
        {
                int val;

                if (optlen != sizeof(val))
                        return -EINVAL;
                if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
                        return -EBUSY;
                if (copy_from_user(&val, optval, sizeof(val)))
                        return -EFAULT;
                switch (val) {
                case TPACKET_V1:
                case TPACKET_V2:
                        po->tp_version = val;
                        return 0;
                default:
                        return -EINVAL;
                }
        }
        case PACKET_RESERVE:
        {
                unsigned int val;

                if (optlen != sizeof(val))
                        return -EINVAL;
                if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
                        return -EBUSY;
                if (copy_from_user(&val, optval, sizeof(val)))
                        return -EFAULT;
                po->tp_reserve = val;
                return 0;
        }
        case PACKET_LOSS:
        {
                unsigned int val;

                if (optlen != sizeof(val))
                        return -EINVAL;
                if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
                        return -EBUSY;
                if (copy_from_user(&val, optval, sizeof(val)))
                        return -EFAULT;
                po->tp_loss = !!val;
                return 0;
        }
        case PACKET_AUXDATA:
        {
                int val;

                if (optlen < sizeof(val))
                        return -EINVAL;
                if (copy_from_user(&val, optval, sizeof(val)))
                        return -EFAULT;

                po->auxdata = !!val;
                return 0;
        }
        case PACKET_ORIGDEV:
        {
                int val;

                if (optlen < sizeof(val))
                        return -EINVAL;
                if (copy_from_user(&val, optval, sizeof(val)))
                        return -EFAULT;

                po->origdev = !!val;
                return 0;
        }
        case PACKET_VNET_HDR:
        {
                int val;

                if (sock->type != SOCK_RAW)
                        return -EINVAL;
                if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
                        return -EBUSY;
                if (optlen < sizeof(val))
                        return -EINVAL;
                if (copy_from_user(&val, optval, sizeof(val)))
                        return -EFAULT;

                po->has_vnet_hdr = !!val;
                return 0;
        }
        default:
                return -ENOPROTOOPT;
        }
}

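/*
 * Read back a socket option. The result is clamped to the caller's
 * buffer: we copy at most the length the user asked for, and write the
 * (possibly reduced) length back through optlen.
 */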
static int packet_getsockopt(struct socket *sock, int level, int optname,
                             char __user *optval, int __user *optlen)
{
        int len;
        int val;
        struct sock *sk = sock->sk;
        struct packet_sock *po = pkt_sk(sk);
        void *data;
        struct tpacket_stats st;

        if (level != SOL_PACKET)
                return -ENOPROTOOPT;

        if (get_user(len, optlen))
                return -EFAULT;

        if (len < 0)
                return -EINVAL;

        switch (optname) {
        case PACKET_STATISTICS:
                if (len > sizeof(struct tpacket_stats))
                        len = sizeof(struct tpacket_stats);
                spin_lock_bh(&sk->sk_receive_queue.lock);
                st = po->stats;
                memset(&po->stats, 0, sizeof(st));
                spin_unlock_bh(&sk->sk_receive_queue.lock);
                st.tp_packets += st.tp_drops;

                data = &st;
                break;
        case PACKET_AUXDATA:
                if (len > sizeof(int))
                        len = sizeof(int);
                val = po->auxdata;

                data = &val;
                break;
        case PACKET_ORIGDEV:
                if (len > sizeof(int))
                        len = sizeof(int);
                val = po->origdev;

                data = &val;
                break;
        case PACKET_VNET_HDR:
                if (len > sizeof(int))
                        len = sizeof(int);
                val = po->has_vnet_hdr;

                data = &val;
                break;
        case PACKET_VERSION:
                if (len > sizeof(int))
                        len = sizeof(int);
                val = po->tp_version;
                data = &val;
                break;
        case PACKET_HDRLEN:
                if (len > sizeof(int))
                        len = sizeof(int);
                /*
                 * Unusually, this option reads its input through optval;
                 * require a full int so we never switch on uninitialized
                 * bytes of val.
                 */
                if (len < sizeof(int))
                        return -EINVAL;
                if (copy_from_user(&val, optval, len))
                        return -EFAULT;
                switch (val) {
                case TPACKET_V1:
                        val = sizeof(struct tpacket_hdr);
                        break;
                case TPACKET_V2:
                        val = sizeof(struct tpacket2_hdr);
                        break;
                default:
                        return -EINVAL;
                }
                data = &val;
                break;
        case PACKET_RESERVE:
                if (len > sizeof(unsigned int))
                        len = sizeof(unsigned int);
                val = po->tp_reserve;
                data = &val;
                break;
        case PACKET_LOSS:
                if (len > sizeof(unsigned int))
                        len = sizeof(unsigned int);
                val = po->tp_loss;
                data = &val;
                break;
        default:
                return -ENOPROTOOPT;
        }

        if (put_user(len, optlen))
                return -EFAULT;
        if (copy_to_user(optval, data, len))
                return -EFAULT;
        return 0;
}


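/*
 * Track device state so a bound socket follows its interface: drop the
 * protocol hook when the device goes down or unregisters (waking the
 * socket with ENETDOWN), and re-add it when the device comes back up.
 */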
static int packet_notifier(struct notifier_block *this, unsigned long msg, void *data)
{
        struct sock *sk;
        struct hlist_node *node;
        struct net_device *dev = data;
        struct net *net = dev_net(dev);

        rcu_read_lock();
        sk_for_each_rcu(sk, node, &net->packet.sklist) {
                struct packet_sock *po = pkt_sk(sk);

                switch (msg) {
                case NETDEV_UNREGISTER:
                        if (po->mclist)
                                packet_dev_mclist(dev, po->mclist, -1);
                        /* fallthrough */

                case NETDEV_DOWN:
                        if (dev->ifindex == po->ifindex) {
                                spin_lock(&po->bind_lock);
                                if (po->running) {
                                        __dev_remove_pack(&po->prot_hook);
                                        __sock_put(sk);
                                        po->running = 0;
                                        sk->sk_err = ENETDOWN;
                                        if (!sock_flag(sk, SOCK_DEAD))
                                                sk->sk_error_report(sk);
                                }
                                if (msg == NETDEV_UNREGISTER) {
                                        po->ifindex = -1;
                                        po->prot_hook.dev = NULL;
                                }
                                spin_unlock(&po->bind_lock);
                        }
                        break;
                case NETDEV_UP:
                        if (dev->ifindex == po->ifindex) {
                                spin_lock(&po->bind_lock);
                                if (po->num && !po->running) {
                                        dev_add_pack(&po->prot_hook);
                                        sock_hold(sk);
                                        po->running = 1;
                                }
                                spin_unlock(&po->bind_lock);
                        }
                        break;
                }
        }
        rcu_read_unlock();
        return NOTIFY_DONE;
}


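/*
 * Socket ioctls. Note that SIOCINQ reports the length of the next
 * pending packet, not the total number of bytes queued on the socket,
 * and that address/routing ioctls are forwarded to the inet layer.
 */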
static int packet_ioctl(struct socket *sock, unsigned int cmd,
                        unsigned long arg)
{
        struct sock *sk = sock->sk;

        switch (cmd) {
        case SIOCOUTQ:
        {
                int amount = sk_wmem_alloc_get(sk);

                return put_user(amount, (int __user *)arg);
        }
        case SIOCINQ:
        {
                struct sk_buff *skb;
                int amount = 0;

                spin_lock_bh(&sk->sk_receive_queue.lock);
                skb = skb_peek(&sk->sk_receive_queue);
                if (skb)
                        amount = skb->len;
                spin_unlock_bh(&sk->sk_receive_queue.lock);
                return put_user(amount, (int __user *)arg);
        }
        case SIOCGSTAMP:
                return sock_get_timestamp(sk, (struct timeval __user *)arg);
        case SIOCGSTAMPNS:
                return sock_get_timestampns(sk, (struct timespec __user *)arg);

#ifdef CONFIG_INET
        case SIOCADDRT:
        case SIOCDELRT:
        case SIOCDARP:
        case SIOCGARP:
        case SIOCSARP:
        case SIOCGIFADDR:
        case SIOCSIFADDR:
        case SIOCGIFBRDADDR:
        case SIOCSIFBRDADDR:
        case SIOCGIFNETMASK:
        case SIOCSIFNETMASK:
        case SIOCGIFDSTADDR:
        case SIOCSIFDSTADDR:
        case SIOCSIFFLAGS:
                return inet_dgram_ops.ioctl(sock, cmd, arg);
#endif

        default:
                return -ENOIOCTLCMD;
        }
        return 0;
}

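/*
 * Ring-aware poll: on top of the generic datagram poll, report POLLIN
 * when the RX ring has a frame ready for userspace and POLLOUT when
 * the TX ring has a free slot.
 */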
static unsigned int packet_poll(struct file *file, struct socket *sock,
                                poll_table *wait)
{
        struct sock *sk = sock->sk;
        struct packet_sock *po = pkt_sk(sk);
        unsigned int mask = datagram_poll(file, sock, wait);

        spin_lock_bh(&sk->sk_receive_queue.lock);
        if (po->rx_ring.pg_vec) {
                if (!packet_previous_frame(po, &po->rx_ring, TP_STATUS_KERNEL))
                        mask |= POLLIN | POLLRDNORM;
        }
        spin_unlock_bh(&sk->sk_receive_queue.lock);
        spin_lock_bh(&sk->sk_write_queue.lock);
        if (po->tx_ring.pg_vec) {
                if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
                        mask |= POLLOUT | POLLWRNORM;
        }
        spin_unlock_bh(&sk->sk_write_queue.lock);
        return mask;
}


/* Dirty? Well, I still have not learned a better way to account
 * for user mmaps.
 */

static void packet_mm_open(struct vm_area_struct *vma)
{
        struct file *file = vma->vm_file;
        struct socket *sock = file->private_data;
        struct sock *sk = sock->sk;

        if (sk)
                atomic_inc(&pkt_sk(sk)->mapped);
}

static void packet_mm_close(struct vm_area_struct *vma)
{
        struct file *file = vma->vm_file;
        struct socket *sock = file->private_data;
        struct sock *sk = sock->sk;

        if (sk)
                atomic_dec(&pkt_sk(sk)->mapped);
}

static const struct vm_operations_struct packet_mmap_ops = {
        .open   =       packet_mm_open,
        .close  =       packet_mm_close,
};

static void free_pg_vec(char **pg_vec, unsigned int order, unsigned int len)
{
        int i;

        for (i = 0; i < len; i++) {
                if (likely(pg_vec[i]))
                        free_pages((unsigned long) pg_vec[i], order);
        }
        kfree(pg_vec);
}

static inline char *alloc_one_pg_vec_page(unsigned long order)
{
        gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP | __GFP_ZERO | __GFP_NOWARN;

        return (char *) __get_free_pages(gfp_flags, order);
}

static char **alloc_pg_vec(struct tpacket_req *req, int order)
{
        unsigned int block_nr = req->tp_block_nr;
        char **pg_vec;
        int i;

        /* kcalloc checks the user-controlled block count for
         * multiplication overflow before allocating the pointer array.
         */
        pg_vec = kcalloc(block_nr, sizeof(char *), GFP_KERNEL);
        if (unlikely(!pg_vec))
                goto out;

        for (i = 0; i < block_nr; i++) {
                pg_vec[i] = alloc_one_pg_vec_page(order);
                if (unlikely(!pg_vec[i]))
                        goto out_free_pgvec;
        }

out:
        return pg_vec;

out_free_pgvec:
        free_pg_vec(pg_vec, order, block_nr);
        pg_vec = NULL;
        goto out;
}

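/*
 * Install (or tear down, when req->tp_block_nr is 0) a ring buffer.
 * The socket is detached from its protocol hook while the old and new
 * page vectors are swapped under the queue lock, then re-attached, so
 * the receive/transmit paths never see a half-initialized ring.
 */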
static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
                int closing, int tx_ring)
{
        char **pg_vec = NULL;
        struct packet_sock *po = pkt_sk(sk);
        int was_running, order = 0;
        struct packet_ring_buffer *rb;
        struct sk_buff_head *rb_queue;
        __be16 num;
        int err;

        rb = tx_ring ? &po->tx_ring : &po->rx_ring;
        rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;

        err = -EBUSY;
        if (!closing) {
                if (atomic_read(&po->mapped))
                        goto out;
                if (atomic_read(&rb->pending))
                        goto out;
        }

        if (req->tp_block_nr) {
                /* Sanity tests and some calculations */
                err = -EBUSY;
                if (unlikely(rb->pg_vec))
                        goto out;

                switch (po->tp_version) {
                case TPACKET_V1:
                        po->tp_hdrlen = TPACKET_HDRLEN;
                        break;
                case TPACKET_V2:
                        po->tp_hdrlen = TPACKET2_HDRLEN;
                        break;
                }

                err = -EINVAL;
                if (unlikely((int)req->tp_block_size <= 0))
                        goto out;
                if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
                        goto out;
                if (unlikely(req->tp_frame_size < po->tp_hdrlen +
                                        po->tp_reserve))
                        goto out;
                if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
                        goto out;

                rb->frames_per_block = req->tp_block_size/req->tp_frame_size;
                if (unlikely(rb->frames_per_block <= 0))
                        goto out;
                if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
                                        req->tp_frame_nr))
                        goto out;

                err = -ENOMEM;
                order = get_order(req->tp_block_size);
                pg_vec = alloc_pg_vec(req, order);
                if (unlikely(!pg_vec))
                        goto out;
        } else {
                /* Done */
                err = -EINVAL;
                if (unlikely(req->tp_frame_nr))
                        goto out;
        }

        lock_sock(sk);

        /* Detach socket from network */
        spin_lock(&po->bind_lock);
        was_running = po->running;
        num = po->num;
        if (was_running) {
                __dev_remove_pack(&po->prot_hook);
                po->num = 0;
                po->running = 0;
                __sock_put(sk);
        }
        spin_unlock(&po->bind_lock);

        synchronize_net();

        err = -EBUSY;
        mutex_lock(&po->pg_vec_lock);
        if (closing || atomic_read(&po->mapped) == 0) {
                err = 0;
#define XC(a, b) ({ __typeof__ ((a)) __t; __t = (a); (a) = (b); __t; })
                spin_lock_bh(&rb_queue->lock);
                pg_vec = XC(rb->pg_vec, pg_vec);
                rb->frame_max = (req->tp_frame_nr - 1);
                rb->head = 0;
                rb->frame_size = req->tp_frame_size;
                spin_unlock_bh(&rb_queue->lock);

                order = XC(rb->pg_vec_order, order);
                req->tp_block_nr = XC(rb->pg_vec_len, req->tp_block_nr);

                rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
                po->prot_hook.func = (po->rx_ring.pg_vec) ?
                                                tpacket_rcv : packet_rcv;
                skb_queue_purge(rb_queue);
#undef XC
                if (atomic_read(&po->mapped))
                        pr_err("packet_mmap: vma is busy: %d\n",
                               atomic_read(&po->mapped));
        }
        mutex_unlock(&po->pg_vec_lock);

        spin_lock(&po->bind_lock);
        if (was_running && !po->running) {
                sock_hold(sk);
                po->running = 1;
                po->num = num;
                dev_add_pack(&po->prot_hook);
        }
        spin_unlock(&po->bind_lock);

        release_sock(sk);

        if (pg_vec)
                free_pg_vec(pg_vec, order, req->tp_block_nr);
out:
        return err;
}

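/*
 * Map the ring(s) into the caller's address space. The RX ring (if
 * any) is mapped first, immediately followed by the TX ring, and the
 * vma must cover both exactly; partial or offset mappings are refused.
 */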
static int packet_mmap(struct file *file, struct socket *sock,
                struct vm_area_struct *vma)
{
        struct sock *sk = sock->sk;
        struct packet_sock *po = pkt_sk(sk);
        unsigned long size, expected_size;
        struct packet_ring_buffer *rb;
        unsigned long start;
        int err = -EINVAL;
        int i;

        if (vma->vm_pgoff)
                return -EINVAL;

        mutex_lock(&po->pg_vec_lock);

        expected_size = 0;
        for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
                if (rb->pg_vec) {
                        expected_size += rb->pg_vec_len
                                                * rb->pg_vec_pages
                                                * PAGE_SIZE;
                }
        }

        if (expected_size == 0)
                goto out;

        size = vma->vm_end - vma->vm_start;
        if (size != expected_size)
                goto out;

        start = vma->vm_start;
        for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
                if (rb->pg_vec == NULL)
                        continue;

                for (i = 0; i < rb->pg_vec_len; i++) {
                        struct page *page = virt_to_page(rb->pg_vec[i]);
                        int pg_num;

                        for (pg_num = 0; pg_num < rb->pg_vec_pages;
                                        pg_num++, page++) {
                                err = vm_insert_page(vma, start, page);
                                if (unlikely(err))
                                        goto out;
                                start += PAGE_SIZE;
                        }
                }
        }

        atomic_inc(&po->mapped);
        vma->vm_ops = &packet_mmap_ops;
        err = 0;

out:
        mutex_unlock(&po->pg_vec_lock);
        return err;
}

static const struct proto_ops packet_ops_spkt = {
        .family =       PF_PACKET,
        .owner =        THIS_MODULE,
        .release =      packet_release,
        .bind =         packet_bind_spkt,
        .connect =      sock_no_connect,
        .socketpair =   sock_no_socketpair,
        .accept =       sock_no_accept,
        .getname =      packet_getname_spkt,
        .poll =         datagram_poll,
        .ioctl =        packet_ioctl,
        .listen =       sock_no_listen,
        .shutdown =     sock_no_shutdown,
        .setsockopt =   sock_no_setsockopt,
        .getsockopt =   sock_no_getsockopt,
        .sendmsg =      packet_sendmsg_spkt,
        .recvmsg =      packet_recvmsg,
        .mmap =         sock_no_mmap,
        .sendpage =     sock_no_sendpage,
};

static const struct proto_ops packet_ops = {
        .family =       PF_PACKET,
        .owner =        THIS_MODULE,
        .release =      packet_release,
        .bind =         packet_bind,
        .connect =      sock_no_connect,
        .socketpair =   sock_no_socketpair,
        .accept =       sock_no_accept,
        .getname =      packet_getname,
        .poll =         packet_poll,
        .ioctl =        packet_ioctl,
        .listen =       sock_no_listen,
        .shutdown =     sock_no_shutdown,
        .setsockopt =   packet_setsockopt,
        .getsockopt =   packet_getsockopt,
        .sendmsg =      packet_sendmsg,
        .recvmsg =      packet_recvmsg,
        .mmap =         packet_mmap,
        .sendpage =     sock_no_sendpage,
};

static const struct net_proto_family packet_family_ops = {
        .family =       PF_PACKET,
        .create =       packet_create,
        .owner  =       THIS_MODULE,
};

static struct notifier_block packet_netdev_notifier = {
        .notifier_call =        packet_notifier,
};

#ifdef CONFIG_PROC_FS

static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
        __acquires(RCU)
{
        struct net *net = seq_file_net(seq);

        rcu_read_lock();
        return seq_hlist_start_head_rcu(&net->packet.sklist, *pos);
}

static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        struct net *net = seq_file_net(seq);
        return seq_hlist_next_rcu(v, &net->packet.sklist, pos);
}

static void packet_seq_stop(struct seq_file *seq, void *v)
        __releases(RCU)
{
        rcu_read_unlock();
}

static int packet_seq_show(struct seq_file *seq, void *v)
{
        if (v == SEQ_START_TOKEN)
                seq_puts(seq, "sk       RefCnt Type Proto  Iface R Rmem   User   Inode\n");
        else {
                struct sock *s = sk_entry(v);
                const struct packet_sock *po = pkt_sk(s);

                seq_printf(seq,
                           "%p %-6d %-4d %04x   %-5d %1d %-6u %-6u %-6lu\n",
                           s,
                           atomic_read(&s->sk_refcnt),
                           s->sk_type,
                           ntohs(po->num),
                           po->ifindex,
                           po->running,
                           atomic_read(&s->sk_rmem_alloc),
                           sock_i_uid(s),
                           sock_i_ino(s));
        }

        return 0;
}

static const struct seq_operations packet_seq_ops = {
        .start  = packet_seq_start,
        .next   = packet_seq_next,
        .stop   = packet_seq_stop,
        .show   = packet_seq_show,
};

static int packet_seq_open(struct inode *inode, struct file *file)
{
        return seq_open_net(inode, file, &packet_seq_ops,
                            sizeof(struct seq_net_private));
}

static const struct file_operations packet_seq_fops = {
        .owner          = THIS_MODULE,
        .open           = packet_seq_open,
        .read           = seq_read,
        .llseek         = seq_lseek,
        .release        = seq_release_net,
};

#endif

static int __net_init packet_net_init(struct net *net)
{
        spin_lock_init(&net->packet.sklist_lock);
        INIT_HLIST_HEAD(&net->packet.sklist);

        if (!proc_net_fops_create(net, "packet", 0, &packet_seq_fops))
                return -ENOMEM;

        return 0;
}

static void __net_exit packet_net_exit(struct net *net)
{
        proc_net_remove(net, "packet");
}

static struct pernet_operations packet_net_ops = {
        .init = packet_net_init,
        .exit = packet_net_exit,
};


static void __exit packet_exit(void)
{
        unregister_netdevice_notifier(&packet_netdev_notifier);
        unregister_pernet_subsys(&packet_net_ops);
        sock_unregister(PF_PACKET);
        proto_unregister(&packet_proto);
}

static int __init packet_init(void)
{
        int rc = proto_register(&packet_proto, 0);

        if (rc != 0)
                goto out;

        sock_register(&packet_family_ops);
        register_pernet_subsys(&packet_net_ops);
        register_netdevice_notifier(&packet_netdev_notifier);
out:
        return rc;
}

module_init(packet_init);
module_exit(packet_exit);
MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_PACKET);