source: npl/kernel/linux_src/linux-5.9-imq.patch @ 1ebb340

Last change on this file since 1ebb340 was 1ebb340, checked in by Edwin Eefting <edwin@datux.nl>, 4 years ago

kernel 5.10

  • Property mode set to 100644
File size: 46.3 KB
RevLine 
[1ebb340]1diff -purN linux-5.8_org/drivers/net/imq.c linux-5.8/drivers/net/imq.c
2--- linux-5.8_org/drivers/net/imq.c     1970-01-01 01:00:00.000000000 +0100
3+++ linux-5.8/drivers/net/imq.c 2020-08-18 21:52:02.073931596 +0200
4@@ -0,0 +1,957 @@
5+/*
6+ *             Pseudo-driver for the intermediate queue device.
7+ *
8+ *             This program is free software; you can redistribute it and/or
9+ *             modify it under the terms of the GNU General Public License
10+ *             as published by the Free Software Foundation; either version
11+ *             2 of the License, or (at your option) any later version.
12+ *
13+ * Authors:    Patrick McHardy, <kaber@trash.net>
14+ *
15+ *            The first version was written by Martin Devera, <devik@cdi.cz>
16+ *
17+ *                        See Credits.txt
18+ */
19+
20+#include <linux/module.h>
21+#include <linux/kernel.h>
22+#include <linux/moduleparam.h>
23+#include <linux/list.h>
24+#include <linux/skbuff.h>
25+#include <linux/netdevice.h>
26+#include <linux/etherdevice.h>
27+#include <linux/rtnetlink.h>
28+#include <linux/if_arp.h>
29+#include <linux/netfilter.h>
30+#include <linux/netfilter_ipv4.h>
31+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
32+#include <linux/netfilter_ipv6.h>
33+#endif
34+#include <linux/imq.h>
35+#include <net/pkt_sched.h>
36+#include <net/netfilter/nf_queue.h>
37+#include <net/sock.h>
38+#include <linux/ip.h>
39+#include <linux/ipv6.h>
40+#include <linux/if_vlan.h>
41+#include <linux/if_pppox.h>
42+#include <net/ip.h>
43+#include <net/ipv6.h>
44+
45+static int imq_nf_queue(struct nf_queue_entry *entry, unsigned queue_num);
46+
47+static nf_hookfn imq_nf_hook;
48+
49+static struct nf_hook_ops imq_ops[] = {
50+       {
51+       /* imq_ingress_ipv4 */
52+               .hook           = imq_nf_hook,
53+               .pf             = PF_INET,
54+               .hooknum        = NF_INET_PRE_ROUTING,
55+#if defined(CONFIG_IMQ_BEHAVIOR_BA) || defined(CONFIG_IMQ_BEHAVIOR_BB)
56+               .priority       = NF_IP_PRI_MANGLE + 1,
57+#else
58+               .priority       = NF_IP_PRI_NAT_DST + 1,
59+#endif
60+       },
61+       {
62+       /* imq_egress_ipv4 */
63+               .hook           = imq_nf_hook,
64+               .pf             = PF_INET,
65+               .hooknum        = NF_INET_POST_ROUTING,
66+#if defined(CONFIG_IMQ_BEHAVIOR_AA) || defined(CONFIG_IMQ_BEHAVIOR_BA)
67+               .priority       = NF_IP_PRI_LAST,
68+#else
69+               .priority       = NF_IP_PRI_NAT_SRC - 1,
70+#endif
71+       },
72+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
73+       {
74+       /* imq_ingress_ipv6 */
75+               .hook           = imq_nf_hook,
76+               .pf             = PF_INET6,
77+               .hooknum        = NF_INET_PRE_ROUTING,
78+#if defined(CONFIG_IMQ_BEHAVIOR_BA) || defined(CONFIG_IMQ_BEHAVIOR_BB)
79+               .priority       = NF_IP6_PRI_MANGLE + 1,
80+#else
81+               .priority       = NF_IP6_PRI_NAT_DST + 1,
82+#endif
83+       },
84+       {
85+       /* imq_egress_ipv6 */
86+               .hook           = imq_nf_hook,
87+               .pf             = PF_INET6,
88+               .hooknum        = NF_INET_POST_ROUTING,
89+#if defined(CONFIG_IMQ_BEHAVIOR_AA) || defined(CONFIG_IMQ_BEHAVIOR_BA)
90+               .priority       = NF_IP6_PRI_LAST,
91+#else
92+               .priority       = NF_IP6_PRI_NAT_SRC - 1,
93+#endif
94+       },
95+#endif
96+};
97+
98+#if defined(CONFIG_IMQ_NUM_DEVS)
99+static int numdevs = CONFIG_IMQ_NUM_DEVS;
100+#else
101+static int numdevs = IMQ_MAX_DEVS;
102+#endif
103+
104+static struct net_device *imq_devs_cache[IMQ_MAX_DEVS];
105+
106+#define IMQ_MAX_QUEUES 32
107+static int numqueues = 1;
108+static u32 imq_hashrnd;
109+static int imq_dev_accurate_stats = 1;
110+
111+static inline __be16 pppoe_proto(const struct sk_buff *skb)
112+{
113+       return *((__be16 *)(skb_mac_header(skb) + ETH_HLEN +
114+                       sizeof(struct pppoe_hdr)));
115+}
116+
117+static u16 imq_hash(struct net_device *dev, struct sk_buff *skb)
118+{
119+       unsigned int pull_len = 0;
120+       u16 protocol = skb->protocol;
121+       u32 addr1, addr2, hash;
122+       int ihl = 0; /* signed: the IPv6 ext-header walk reports errors as < 0 */
123+       union {
124+               u16 in16[2];
125+               u32 in32;
126+       } ports;
127+       u8 ip_proto;
128+       /* Hash (part of) the IP addresses, the protocol and the 4 bytes after the
129+        * IP header onto one of dev's tx queues. VLAN and PPPoE session headers
130+        * are skipped first; anything that is not IP is mapped by ethertype. */
131+recheck:
132+       switch (protocol) {
133+       case htons(ETH_P_8021Q): {
134+               if (unlikely(skb_pull(skb, VLAN_HLEN) == NULL))
135+                       goto other;
136+
137+               pull_len += VLAN_HLEN;
138+               skb->network_header += VLAN_HLEN;
139+
140+               protocol = vlan_eth_hdr(skb)->h_vlan_encapsulated_proto;
141+               goto recheck;
142+       }
143+
144+       case htons(ETH_P_PPP_SES): {
145+               if (unlikely(skb_pull(skb, PPPOE_SES_HLEN) == NULL))
146+                       goto other;
147+
148+               pull_len += PPPOE_SES_HLEN;
149+               skb->network_header += PPPOE_SES_HLEN;
150+
151+               protocol = pppoe_proto(skb);
152+               goto recheck;
153+       }
154+
155+       case htons(ETH_P_IP): {
156+               const struct iphdr *iph;
157+
158+               if (unlikely(!pskb_may_pull(skb, sizeof(struct iphdr))))
159+                       goto other;
160+               iph = ip_hdr(skb); /* only after the pull: it may move the head */
161+               addr1 = iph->daddr;
162+               addr2 = iph->saddr;
163+
164+               ip_proto = !(iph->frag_off & htons(IP_MF | IP_OFFSET)) ?
165+                                iph->protocol : 0;
166+               ihl = ip_hdrlen(skb);
167+
168+               break;
169+       }
170+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
171+       case htons(ETH_P_IPV6): {
172+               const struct ipv6hdr *iph;
173+               __be16 fo = 0;
174+
175+               if (unlikely(!pskb_may_pull(skb, sizeof(struct ipv6hdr))))
176+                       goto other;
177+               iph = ipv6_hdr(skb); /* only after the pull: it may move the head */
178+               addr1 = iph->daddr.s6_addr32[3];
179+               addr2 = iph->saddr.s6_addr32[3];
180+               ip_proto = iph->nexthdr; /* in/out: start of the ext-header walk */
181+               ihl = ipv6_skip_exthdr(skb, sizeof(*iph), &ip_proto, &fo);
182+               if (unlikely(ihl < 0))
183+                       goto other;
184+
185+               break;
186+       }
187+#endif
188+       default:
189+other:
190+               if (pull_len != 0) {
191+                       skb_push(skb, pull_len);
192+                       skb->network_header -= pull_len;
193+               }
194+
195+               return (u16)(ntohs(protocol) % dev->real_num_tx_queues);
196+       }
197+
198+       if (addr1 > addr2)
199+               swap(addr1, addr2);
200+
201+       switch (ip_proto) {
202+       case IPPROTO_TCP:
203+       case IPPROTO_UDP:
204+       case IPPROTO_DCCP:
205+       case IPPROTO_ESP:
206+       case IPPROTO_AH:
207+       case IPPROTO_SCTP:
208+       case IPPROTO_UDPLITE: {
209+               if (likely(skb_copy_bits(skb, ihl, &ports.in32, 4) >= 0)) {
210+                       if (ports.in16[0] > ports.in16[1])
211+                               swap(ports.in16[0], ports.in16[1]);
212+                       break;
213+               }
214+               /* fall-through */
215+       }
216+       default:
217+               ports.in32 = 0;
218+               break;
219+       }
220+
221+       if (pull_len != 0) {
222+               skb_push(skb, pull_len);
223+               skb->network_header -= pull_len;
224+       }
225+
226+       hash = jhash_3words(addr1, addr2, ports.in32, imq_hashrnd ^ ip_proto);
227+
228+       return (u16)(((u64)hash * dev->real_num_tx_queues) >> 32);
229+}
230+
231+static inline bool sk_tx_queue_recorded(struct sock *sk)
232+{
233+       return (sk_tx_queue_get(sk) >= 0);
234+}
235+
236+static struct netdev_queue *imq_select_queue(struct net_device *dev,
237+                                               struct sk_buff *skb)
238+{
239+       u16 queue_index = 0;
240+       u32 hash;
241+
242+       if (likely(dev->real_num_tx_queues == 1))
243+               goto out;
244+
245+       /* IMQ can be receiving ingress or egress packets. */
246+
247+       /* Check first whether rx_queue is set */
248+       if (skb_rx_queue_recorded(skb)) {
249+               queue_index = skb_get_rx_queue(skb);
250+               goto out;
251+       }
252+
253+       /* Check if socket has tx_queue set */
254+       if (sk_tx_queue_recorded(skb->sk)) {
255+               queue_index = sk_tx_queue_get(skb->sk);
256+               goto out;
257+       }
258+
259+       /* Try to use the socket hash */
260+       if (skb->sk && skb->sk->sk_hash) {
261+               hash = skb->sk->sk_hash;
262+               queue_index =
263+                       (u16)(((u64)hash * dev->real_num_tx_queues) >> 32);
264+               goto out;
265+       }
266+
267+       /* Generate hash from packet data */
268+       queue_index = imq_hash(dev, skb);
269+
270+out:
271+       if (unlikely(queue_index >= dev->real_num_tx_queues))
272+               queue_index = (u16)((u32)queue_index % dev->real_num_tx_queues);
273+
274+       skb_set_queue_mapping(skb, queue_index);
275+       return netdev_get_tx_queue(dev, queue_index);
276+}
277+
278+static struct net_device_stats *imq_get_stats(struct net_device *dev)
279+{
280+       return &dev->stats;
281+}
282+
283+/* called for packets kfree'd in qdiscs at places other than enqueue */
284+static void imq_skb_destructor(struct sk_buff *skb)
285+{
286+       struct nf_queue_entry *entry = skb->nf_queue_entry;
287+
288+       skb->nf_queue_entry = NULL;
289+
290+       if (entry) {
291+               nf_queue_entry_free(entry);
292+               kfree(entry);
293+       }
294+
295+       skb_restore_cb(skb); /* kfree backup */
296+}
297+
298+static void imq_done_check_queue_mapping(struct sk_buff *skb,
299+                                        struct net_device *dev)
300+{
301+       unsigned int queue_index;
302+
303+       /* Don't let queue_mapping be left too large after exiting IMQ */
304+       if (likely(skb->dev != dev && skb->dev != NULL)) {
305+               queue_index = skb_get_queue_mapping(skb);
306+               if (unlikely(queue_index >= skb->dev->real_num_tx_queues)) {
307+                       queue_index = (u16)((u32)queue_index %
308+                                               skb->dev->real_num_tx_queues);
309+                       skb_set_queue_mapping(skb, queue_index);
310+               }
311+       } else {
312+               /* skb->dev was IMQ device itself or NULL, be on safe side and
313+                * just clear queue mapping.
314+                */
315+               skb_set_queue_mapping(skb, 0);
316+       }
317+}
318+
319+static netdev_tx_t imq_dev_xmit(struct sk_buff *skb, struct net_device *dev)
320+{
321+       struct nf_queue_entry *entry = skb->nf_queue_entry;
322+
323+       rcu_read_lock();
324+
325+       skb->nf_queue_entry = NULL;
326+       netif_trans_update(dev);
327+
328+       dev->stats.tx_bytes += skb->len;
329+       dev->stats.tx_packets++;
330+
331+       if (unlikely(entry == NULL)) {
332+               /* We don't know what is going on here.. packet is queued for
333+                * imq device, but (probably) not by us.
334+                *
335+                * If this packet was not sent here by imq_nf_queue(), then
336+                * skb_save_cb() was not used and skb_free() should not show:
337+                *   WARNING: IMQ: kfree_skb: skb->cb_next:..
338+                * and/or
339+                *   WARNING: IMQ: kfree_skb: skb->nf_queue_entry...
340+                *
341+                * However if this message is shown, then IMQ is somehow broken
342+                * and you should report this to linuximq.net.
343+                */
344+
345+               /* imq_dev_xmit is black hole that eats all packets, report that
346+                * we eat this packet happily and increase dropped counters.
347+                */
348+
349+               dev->stats.tx_dropped++;
350+               dev_kfree_skb(skb);
351+
352+               rcu_read_unlock();
353+               return NETDEV_TX_OK;
354+       }
355+
356+       skb_restore_cb(skb); /* restore skb->cb */
357+
358+       skb->imq_flags = 0;
359+       skb->destructor = NULL;
360+
361+       imq_done_check_queue_mapping(skb, dev);
362+
363+       nf_reinject(entry, NF_ACCEPT);
364+
365+       rcu_read_unlock();
366+       return NETDEV_TX_OK;
367+}
368+
369+static struct net_device *get_imq_device_by_index(int index)
370+{
371+       struct net_device *dev = NULL;
372+       struct net *net;
373+       char buf[8];
374+
375+       /* get device by name and cache result */
376+       snprintf(buf, sizeof(buf), "imq%d", index);
377+
378+       /* Search device from all namespaces. */
379+       for_each_net(net) {
380+               dev = dev_get_by_name(net, buf);
381+               if (dev)
382+                       break;
383+       }
384+
385+       if (WARN_ON_ONCE(dev == NULL)) {
386+               /* IMQ device not found. Exotic config? */
387+               return ERR_PTR(-ENODEV);
388+       }
389+
390+       imq_devs_cache[index] = dev;
391+       dev_put(dev);
392+
393+       return dev;
394+}
395+
396+static struct nf_queue_entry *nf_queue_entry_dup(struct nf_queue_entry *e)
397+{
398+       struct nf_queue_entry *entry = kmemdup(e, e->size, GFP_ATOMIC);
399+       if (entry) {
400+               nf_queue_entry_get_refs(entry);
401+               return entry;
402+       }
403+       return NULL;
404+}
405+
406+#ifdef CONFIG_BRIDGE_NETFILTER
407+/* When called from bridge netfilter, skb->data must point to MAC header
408+ * before calling skb_gso_segment(). Else, original MAC header is lost
409+ * and segmented skbs will be sent to wrong destination.
410+ */
411+static void nf_bridge_adjust_skb_data(struct sk_buff *skb)
412+{
413+       if (skb->nf_bridge)
414+               __skb_push(skb, skb->network_header - skb->mac_header);
415+}
416+
417+static void nf_bridge_adjust_segmented_data(struct sk_buff *skb)
418+{
419+       if (skb->nf_bridge)
420+               __skb_pull(skb, skb->network_header - skb->mac_header);
421+}
422+#else
423+#define nf_bridge_adjust_skb_data(s) do {} while (0)
424+#define nf_bridge_adjust_segmented_data(s) do {} while (0)
425+#endif
426+
427+static int __imq_nf_queue(struct nf_queue_entry *entry, struct net_device *dev);
428+
429+static int __imq_nf_queue_gso(struct nf_queue_entry *entry,
430+                             struct net_device *dev, struct sk_buff *skb)
431+{
432+       int ret = -ENOMEM;
433+       struct nf_queue_entry *entry_seg;
434+
435+       nf_bridge_adjust_segmented_data(skb);
436+
437+       if (skb->next == NULL) { /* last packet, no need to copy entry */
438+               struct sk_buff *gso_skb = entry->skb;
439+               entry->skb = skb;
440+               ret = __imq_nf_queue(entry, dev);
441+               if (ret)
442+                       entry->skb = gso_skb;
443+               return ret;
444+       }
445+
446+       skb->next = NULL;
447+
448+       entry_seg = nf_queue_entry_dup(entry);
449+       if (entry_seg) {
450+               entry_seg->skb = skb;
451+               ret = __imq_nf_queue(entry_seg, dev);
452+               if (ret)
453+                       nf_queue_entry_free(entry_seg);
454+       }
455+       return ret;
456+}
457+
458+static int imq_nf_queue(struct nf_queue_entry *entry, unsigned queue_num)
459+{
460+       struct sk_buff *skb, *segs;
461+       struct net_device *dev;
462+       unsigned int queued;
463+       int index, retval, err;
464+
465+       index = entry->skb->imq_flags & IMQ_F_IFMASK;
466+       if (unlikely(index > numdevs - 1)) {
467+               if (net_ratelimit())
468+                       pr_warn("IMQ: invalid device specified, highest is %u\n",
469+                               numdevs - 1);
470+               retval = -EINVAL;
471+               goto out_no_dev;
472+       }
473+
474+       /* check for imq device by index from cache */
475+       dev = imq_devs_cache[index];
476+       if (unlikely(!dev)) {
477+               dev = get_imq_device_by_index(index);
478+               if (IS_ERR(dev)) {
479+                       retval = PTR_ERR(dev);
480+                       goto out_no_dev;
481+               }
482+       }
483+
484+       if (unlikely(!(dev->flags & IFF_UP))) {
485+               entry->skb->imq_flags = 0;
486+               retval = -ECANCELED;
487+               goto out_no_dev;
488+       }
489+
490+       /* Since 3.10.x, GSO handling moved here as result of upstream commit
491+        * a5fedd43d5f6c94c71053a66e4c3d2e35f1731a2 (netfilter: move
492+        * skb_gso_segment into nfnetlink_queue module).
493+        *
494+        * Following code replicates the gso handling from
495+        * 'net/netfilter/nfnetlink_queue_core.c':nfqnl_enqueue_packet().
496+        */
497+
498+       skb = entry->skb;
499+
500+       switch (entry->state.pf) {
501+       case NFPROTO_IPV4:
502+               skb->protocol = htons(ETH_P_IP);
503+               break;
504+       case NFPROTO_IPV6:
505+               skb->protocol = htons(ETH_P_IPV6);
506+               break;
507+       }
508+
509+       if (!skb_is_gso(entry->skb))
510+               return __imq_nf_queue(entry, dev);
511+
512+       nf_bridge_adjust_skb_data(skb);
513+       segs = skb_gso_segment(skb, 0);
514+       /* Does not use PTR_ERR to limit the number of error codes that can be
515+        * returned by nf_queue.  For instance, callers rely on -ECANCELED to
516+        * mean 'ignore this hook'.
517+        */
518+       err = -ENOBUFS;
519+       if (IS_ERR(segs))
520+               goto out_err;
521+       queued = 0;
522+       err = 0;
523+       do {
524+               struct sk_buff *nskb = segs->next;
525+               if (nskb && nskb->next)
526+                       nskb->cb_next = NULL;
527+               if (err == 0)
528+                       err = __imq_nf_queue_gso(entry, dev, segs);
529+               if (err == 0)
530+                       queued++;
531+               else
532+                       kfree_skb(segs);
533+               segs = nskb;
534+       } while (segs);
535+
536+       if (queued) {
537+               if (err) /* some segments are already queued */
538+                       nf_queue_entry_free(entry);
539+               kfree_skb(skb);
540+               return 0;
541+       }
542+
543+out_err:
544+       nf_bridge_adjust_segmented_data(skb);
545+       retval = err;
546+out_no_dev:
547+       return retval;
548+}
549+
550+static int __imq_nf_queue(struct nf_queue_entry *entry, struct net_device *dev)
551+{
552+       struct sk_buff *skb_orig, *skb, *skb_shared, *skb_popd;
553+       struct Qdisc *q;
554+       struct sk_buff *to_free = NULL;
555+       struct netdev_queue *txq;
556+       spinlock_t *root_lock;
557+       int users;
558+       int retval = -EINVAL;
559+       unsigned int orig_queue_index;
560+       bool again = false;
561+
562+       dev->last_rx = jiffies;
563+
564+       skb = entry->skb;
565+       skb_orig = NULL;
566+
567+       /* skb has owner? => make clone */
568+       if (unlikely(skb->destructor)) {
569+               skb_orig = skb;
570+               skb = skb_clone(skb, GFP_ATOMIC);
571+               if (unlikely(!skb)) {
572+                       retval = -ENOMEM;
573+                       goto out;
574+               }
575+               skb->cb_next = NULL;
576+               entry->skb = skb;
577+       }
578+
579+       dev->stats.rx_bytes += skb->len;
580+       dev->stats.rx_packets++;
581+
582+       if (!skb->dev) {
583+               /* skb->dev == NULL causes problems, try to find the cause. */
584+               if (net_ratelimit()) {
585+                       dev_warn(&dev->dev,
586+                                "received packet with skb->dev == NULL\n");
587+                       dump_stack();
588+               }
589+
590+               skb->dev = dev;
591+       }
592+
593+       /* Disables softirqs for lock below */
594+       rcu_read_lock_bh();
595+
596+       /* Multi-queue selection */
597+       orig_queue_index = skb_get_queue_mapping(skb);
598+       txq = imq_select_queue(dev, skb);
599+
600+       q = rcu_dereference_bh(txq->qdisc);
601+       if (unlikely(!q->enqueue))
602+               goto packet_not_eaten_by_imq_dev;
603+
604+       skb->nf_queue_entry = entry;
605+       root_lock = qdisc_lock(q);
606+       spin_lock(root_lock);
607+
608+       users = refcount_read(&skb->users);
609+
610+       skb_shared = skb_get(skb); /* increase reference count by one */
611+
612+       /* backup skb->cb, as qdisc layer will overwrite it */
613+       skb_save_cb(skb_shared);
614+       qdisc_enqueue_root(skb_shared, q, &to_free); /* might kfree_skb */
615+       if (likely(refcount_read(&skb_shared->users) == users + 1)) {
616+               bool validate;
617+
618+               kfree_skb(skb_shared); /* decrease reference count by one */
619+
620+               skb->destructor = &imq_skb_destructor;
621+
622+               skb_popd = qdisc_dequeue_skb(q, &validate);
623+
624+               /* cloned? */
625+               if (unlikely(skb_orig))
626+                       kfree_skb(skb_orig); /* free original */
627+
628+               spin_unlock(root_lock);
629+
630+#if 0
631+               /* schedule qdisc dequeue */
632+               __netif_schedule(q);
633+#else
634+               if (likely(skb_popd)) {
635+                       /* Note that we validate skb (GSO, checksum, ...) outside of locks */
636+                       if (validate)
637+                       skb_popd = validate_xmit_skb_list(skb_popd, dev, &again);
638+
639+                       if (skb_popd) {
640+                               int dummy_ret;
641+                               int cpu = smp_processor_id(); /* ok because BHs are off */
642+
643+                               txq = skb_get_tx_queue(dev, skb_popd);
644+                               /*
645+                               The IMQ device is never frozen or stopped, and its xmit always succeeds,
646+                               so its status and return value need not be checked (this is faster).
647+                               */
648+                               if (imq_dev_accurate_stats && txq->xmit_lock_owner != cpu) {
649+                                       HARD_TX_LOCK(dev, txq, cpu);
650+                                       if (!netif_xmit_frozen_or_stopped(txq)) {
651+                                               dev_hard_start_xmit(skb_popd, dev, txq, &dummy_ret);
652+                                       }
653+                                       HARD_TX_UNLOCK(dev, txq);
654+                               } else {
655+                                       if (!netif_xmit_frozen_or_stopped(txq)) {
656+                                               dev_hard_start_xmit(skb_popd, dev, txq, &dummy_ret);
657+                                       }
658+                               }
659+                       }
660+               } else {
661+                       /* No ready skb, then schedule it */
662+                       __netif_schedule(q);
663+               }
664+#endif
665+               rcu_read_unlock_bh();
666+               retval = 0;
667+               goto out;
668+       } else {
669+               skb_restore_cb(skb_shared); /* restore skb->cb */
670+               skb->nf_queue_entry = NULL;
671+               /*
672+                * The qdisc dropped the packet and already decreased the skb
673+                * reference count, so we must not free it again here, as that
674+                * would actually destroy the skb.
675+                */
676+               spin_unlock(root_lock);
677+               goto packet_not_eaten_by_imq_dev;
678+       }
679+
680+packet_not_eaten_by_imq_dev:
681+       skb_set_queue_mapping(skb, orig_queue_index);
682+       rcu_read_unlock_bh();
683+
684+       /* cloned? restore original */
685+       if (unlikely(skb_orig)) {
686+               kfree_skb(skb);
687+               entry->skb = skb_orig;
688+       }
689+       retval = -1;
690+out:
691+       if (unlikely(to_free)) {
692+               kfree_skb_list(to_free);
693+       }
694+       return retval;
695+}
696+static unsigned int imq_nf_hook(void *priv,
697+                               struct sk_buff *skb,
698+                               const struct nf_hook_state *state)
699+{
700+       return (skb->imq_flags & IMQ_F_ENQUEUE) ? NF_IMQ_QUEUE : NF_ACCEPT;
701+}
702+
703+static int imq_close(struct net_device *dev)
704+{
705+       netif_stop_queue(dev);
706+       return 0;
707+}
708+
709+static int imq_open(struct net_device *dev)
710+{
711+       netif_start_queue(dev);
712+       return 0;
713+}
714+
715+static struct device_type imq_device_type = {
716+       .name = "imq",
717+};
718+
719+static const struct net_device_ops imq_netdev_ops = {
720+       .ndo_open               = imq_open,
721+       .ndo_stop               = imq_close,
722+       .ndo_start_xmit         = imq_dev_xmit,
723+       .ndo_get_stats          = imq_get_stats,
724+};
725+
726+static void imq_setup(struct net_device *dev)
727+{
728+       dev->netdev_ops         = &imq_netdev_ops;
729+       dev->type               = ARPHRD_VOID;
730+       dev->mtu                = 16000; /* too small? */
731+       dev->tx_queue_len       = 11000; /* too big? */
732+       dev->flags              = IFF_NOARP;
733+       dev->features           = NETIF_F_SG | NETIF_F_FRAGLIST |
734+                                 NETIF_F_GSO | NETIF_F_HW_CSUM |
735+                                 NETIF_F_HIGHDMA;
736+       dev->priv_flags         &= ~(IFF_XMIT_DST_RELEASE |
737+                                    IFF_TX_SKB_SHARING);
738+}
739+
740+static int imq_validate(struct nlattr *tb[], struct nlattr *data[],
741+                       struct netlink_ext_ack *extack)
742+{
743+       int ret = 0;
744+
745+       if (tb[IFLA_ADDRESS]) {
746+               if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) {
747+                       ret = -EINVAL;
748+                       goto end;
749+               }
750+               if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) {
751+                       ret = -EADDRNOTAVAIL;
752+                       goto end;
753+               }
754+       }
755+       return 0;
756+end:
757+       pr_warn("IMQ: imq_validate failed (%d)\n", ret);
758+       return ret;
759+}
760+
761+static struct rtnl_link_ops imq_link_ops __read_mostly = {
762+       .kind           = "imq",
763+       .priv_size      = 0,
764+       .setup          = imq_setup,
765+       .validate       = imq_validate,
766+};
767+
768+static const struct nf_queue_handler imq_nfqh = {
769+       .outfn = imq_nf_queue,
770+};
771+
772+static int __net_init imq_nf_register(struct net *net)
773+{
774+       /* Per-netns init (imq_net_ops.init): install every hook in imq_ops. */
775+       return nf_register_net_hooks(net, imq_ops, ARRAY_SIZE(imq_ops));
776+}
777+
778+static void __net_exit imq_nf_unregister(struct net *net)
779+{
780+       /* Per-netns exit (imq_net_ops.exit): remove every hook in imq_ops. */
781+       nf_unregister_net_hooks(net, imq_ops, ARRAY_SIZE(imq_ops));
782+}
783+
784+static struct pernet_operations imq_net_ops = {
785+       .init           = imq_nf_register,
786+       .exit           = imq_nf_unregister,
787+};
788+
789+static int __net_init imq_init_hooks(void)
790+{
791+       int ret;
792+       nf_register_queue_imq_handler(&imq_nfqh);
793+
794+       ret = register_pernet_subsys(&imq_net_ops);
795+       if (ret < 0)
796+               nf_unregister_queue_imq_handler();
797+
798+       return ret;
799+}
800+
801+#ifdef CONFIG_LOCKDEP
802+       static struct lock_class_key imq_netdev_addr_lock_key;
803+
804+       static void __init imq_dev_set_lockdep_one(struct net_device *dev,
805+                                   struct netdev_queue *txq, void *arg)
806+       {
807+       /*
808+        * the IMQ transmit locks can be taken recursively,
809+        * for example with one IMQ rule for input- and one for
810+        * output network devices in iptables!
811+        * until we find a better solution ignore them.
812+        */
813+               lockdep_set_novalidate_class(&txq->_xmit_lock);
814+       }
815+
816+       static void imq_dev_set_lockdep_class(struct net_device *dev)
817+               {
818+                       lockdep_set_class_and_name(&dev->addr_list_lock,
819+                                                  &imq_netdev_addr_lock_key, "_xmit_addr_IMQ");
820+                       netdev_for_each_tx_queue(dev, imq_dev_set_lockdep_one, NULL);
821+}
822+#else
823+       static inline void imq_dev_set_lockdep_class(struct net_device *dev)
824+               {
825+               }
826+#endif
827+
828+static int __init imq_init_one(int index)
829+{
830+       struct net_device *dev;
831+       int ret;
832+
833+       dev = alloc_netdev_mq(0, "imq%d", NET_NAME_UNKNOWN, imq_setup, numqueues);
834+       if (!dev)
835+               return -ENOMEM;
836+
837+       ret = dev_alloc_name(dev, dev->name);
838+       if (ret < 0)
839+               goto fail;
840+
841+       dev->rtnl_link_ops = &imq_link_ops;
842+       SET_NETDEV_DEVTYPE(dev, &imq_device_type);
843+       ret = register_netdevice(dev);
844+       if (ret < 0)
845+               goto fail;
846+
847+       imq_dev_set_lockdep_class(dev);
848+
849+       return 0;
850+fail:
851+       free_netdev(dev);
852+       return ret;
853+}
854+
855+static int __init imq_init_devs(void)
856+{
857+       int err, i;
858+
859+       if (numdevs < 1 || numdevs > IMQ_MAX_DEVS) {
860+               pr_err("IMQ: numdevs has to be betweed 1 and %u\n",
861+                      IMQ_MAX_DEVS);
862+               return -EINVAL;
863+       }
864+
865+       if (numqueues < 1 || numqueues > IMQ_MAX_QUEUES) {
866+               pr_err("IMQ: numqueues has to be betweed 1 and %u\n",
867+                      IMQ_MAX_QUEUES);
868+               return -EINVAL;
869+       }
870+
871+       get_random_bytes(&imq_hashrnd, sizeof(imq_hashrnd));
872+
873+       rtnl_lock();
874+       err = __rtnl_link_register(&imq_link_ops);
875+
876+       for (i = 0; i < numdevs && !err; i++)
877+               err = imq_init_one(i);
878+
879+       if (err) {
880+               __rtnl_link_unregister(&imq_link_ops);
881+               memset(imq_devs_cache, 0, sizeof(imq_devs_cache));
882+       }
883+       rtnl_unlock();
884+
885+       return err;
886+}
887+
888+static int __init imq_init_module(void)
889+{
890+       int err;
891+       /* Module entry: create the imq devices, then install the netfilter hooks. */
892+#if defined(CONFIG_IMQ_NUM_DEVS)
893+       BUILD_BUG_ON(CONFIG_IMQ_NUM_DEVS > 16);
894+       BUILD_BUG_ON(CONFIG_IMQ_NUM_DEVS < 2);
895+       BUILD_BUG_ON(CONFIG_IMQ_NUM_DEVS - 1 > IMQ_F_IFMASK);
896+#endif
897+
898+       err = imq_init_devs();
899+       if (err) {
900+               pr_err("IMQ: Error trying imq_init_devs(net)\n");
901+               return err;
902+       }
903+       /* Hooks go in last; if that fails, undo the link-ops registration above. */
904+       err = imq_init_hooks();
905+       if (err) {
906+               pr_err("IMQ: Error trying imq_init_hooks()\n");
907+               rtnl_link_unregister(&imq_link_ops);
908+               memset(imq_devs_cache, 0, sizeof(imq_devs_cache));
909+               return err;
910+       }
911+
912+       pr_info("IMQ driver loaded successfully. (numdevs = %d, numqueues = %d, imq_dev_accurate_stats = %d)\n",
913+               numdevs, numqueues, imq_dev_accurate_stats);
914+
915+#if defined(CONFIG_IMQ_BEHAVIOR_BA) || defined(CONFIG_IMQ_BEHAVIOR_BB)
916+       pr_info("\tHooking IMQ before NAT on PREROUTING.\n");
917+#else
918+       pr_info("\tHooking IMQ after NAT on PREROUTING.\n");
919+#endif
920+#if defined(CONFIG_IMQ_BEHAVIOR_AB) || defined(CONFIG_IMQ_BEHAVIOR_BB)
921+       pr_info("\tHooking IMQ before NAT on POSTROUTING.\n");
922+#else
923+       pr_info("\tHooking IMQ after NAT on POSTROUTING.\n");
924+#endif
925+
926+       return 0;
927+}
928+
929+static void __exit imq_unhook(void)
930+{
931+       unregister_pernet_subsys(&imq_net_ops);
932+       nf_unregister_queue_imq_handler();
933+}
934+
935+static void __exit imq_cleanup_devs(void)
936+{
937+       rtnl_link_unregister(&imq_link_ops);
938+       memset(imq_devs_cache, 0, sizeof(imq_devs_cache));
939+}
940+
941+static void __exit imq_exit_module(void)
942+{
943+       imq_unhook();
944+       imq_cleanup_devs();
945+       pr_info("IMQ driver unloaded successfully.\n");
946+}
947+
948+module_init(imq_init_module);
949+module_exit(imq_exit_module);
950+
951+module_param(numdevs, int, 0);
952+module_param(numqueues, int, 0);
953+module_param(imq_dev_accurate_stats, int, 0);
954+MODULE_PARM_DESC(numdevs, "number of IMQ devices (how many imq* devices will be created)");
955+MODULE_PARM_DESC(numqueues, "number of queues per IMQ device");
956+MODULE_PARM_DESC(imq_dev_accurate_stats, "Notify if need the accurate imq device stats");
957+
958+MODULE_AUTHOR("https://github.com/imq/linuximq");
959+MODULE_DESCRIPTION("Pseudo-driver for the intermediate queue device. See https://github.com/imq/linuximq/wiki for more information.");
960+MODULE_LICENSE("GPL");
961+MODULE_ALIAS_RTNL_LINK("imq");
962diff -purN linux-5.8_org/drivers/net/Kconfig linux-5.8/drivers/net/Kconfig
963--- linux-5.8_org/drivers/net/Kconfig   2020-08-02 23:21:45.000000000 +0200
964+++ linux-5.8/drivers/net/Kconfig       2020-08-12 21:08:08.643831110 +0200
965@@ -339,6 +339,125 @@ config RIONET_RX_SIZE
966        depends on RIONET
967        default "128"
968 
969+config IMQ
970+       tristate "IMQ (intermediate queueing device) support"
971+       depends on NETDEVICES && NETFILTER
972+       help
973+         IMQ devices are used as placeholders for QoS queueing
974+         disciplines. Every packet entering/leaving the IP stack can be
975+         directed through the IMQ device where it's enqueued/dequeued to the
976+         attached qdisc. This allows you to treat network devices as classes
977+         and distribute bandwidth among them. Iptables is used to specify
978+         through which IMQ device, if any, packets travel.
979+
980+         More information at: https://github.com/imq/linuximq
981+
982+         To compile this driver as a module, choose M here: the module
983+         will be called imq.  If unsure, say N.
984+
985+choice
986+       prompt "IMQ behavior (PRE/POSTROUTING)"
987+       depends on IMQ
988+       default IMQ_BEHAVIOR_AB
989+       help
990+         This setting defines how IMQ behaves in respect to its
991+         hooking in PREROUTING and POSTROUTING.
992+
993+         IMQ can work in any of the following ways:
994+
995+             PREROUTING   |      POSTROUTING
996+         -----------------|-------------------
997+         #1  After NAT    |      After NAT
998+         #2  After NAT    |      Before NAT
999+         #3  Before NAT   |      After NAT
1000+         #4  Before NAT   |      Before NAT
1001+
1002+         The default behavior is to hook after NAT on PREROUTING
1003+         and before NAT on POSTROUTING (#2).
1004+
1005+         These settings are especially useful when trying to use IMQ
1006+         to shape NATed clients.
1007+
1008+         More information can be found at: https://github.com/imq/linuximq
1009+
1010+         If not sure leave the default settings alone.
1011+
1012+config IMQ_BEHAVIOR_AA
1013+       bool "IMQ AA"
1014+       help
1015+         This setting defines how IMQ behaves in respect to its
1016+         hooking in PREROUTING and POSTROUTING.
1017+
1018+         Choosing this option will make IMQ hook like this:
1019+
1020+         PREROUTING:   After NAT
1021+         POSTROUTING:  After NAT
1022+
1023+         More information can be found at: https://github.com/imq/linuximq
1024+
1025+         If not sure leave the default settings alone.
1026+
1027+config IMQ_BEHAVIOR_AB
1028+       bool "IMQ AB"
1029+       help
1030+         This setting defines how IMQ behaves in respect to its
1031+         hooking in PREROUTING and POSTROUTING.
1032+
1033+         Choosing this option will make IMQ hook like this:
1034+
1035+         PREROUTING:   After NAT
1036+         POSTROUTING:  Before NAT
1037+
1038+         More information can be found at: https://github.com/imq/linuximq
1039+
1040+         If not sure leave the default settings alone.
1041+
1042+config IMQ_BEHAVIOR_BA
1043+       bool "IMQ BA"
1044+       help
1045+         This setting defines how IMQ behaves in respect to its
1046+         hooking in PREROUTING and POSTROUTING.
1047+
1048+         Choosing this option will make IMQ hook like this:
1049+
1050+         PREROUTING:   Before NAT
1051+         POSTROUTING:  After NAT
1052+
1053+         More information can be found at: https://github.com/imq/linuximq
1054+
1055+         If not sure leave the default settings alone.
1056+
1057+config IMQ_BEHAVIOR_BB
1058+       bool "IMQ BB"
1059+       help
1060+         This setting defines how IMQ behaves in respect to its
1061+         hooking in PREROUTING and POSTROUTING.
1062+
1063+         Choosing this option will make IMQ hook like this:
1064+
1065+         PREROUTING:   Before NAT
1066+         POSTROUTING:  Before NAT
1067+
1068+         More information can be found at: https://github.com/imq/linuximq
1069+
1070+         If not sure leave the default settings alone.
1071+
1072+endchoice
1073+
1074+config IMQ_NUM_DEVS
1075+       int "Number of IMQ devices"
1076+       range 2 16
1077+       depends on IMQ
1078+       default "16"
1079+       help
1080+         This setting defines how many IMQ devices will be created.
1081+
1082+         The default value is 16.
1083+
1084+         More information can be found at: https://github.com/imq/linuximq
1085+
1086+         If not sure leave the default settings alone.
1087+
1088 config TUN
1089        tristate "Universal TUN/TAP device driver support"
1090        depends on INET
1091diff -purN linux-5.8_org/drivers/net/Makefile linux-5.8/drivers/net/Makefile
1092--- linux-5.8_org/drivers/net/Makefile  2020-08-02 23:21:45.000000000 +0200
1093+++ linux-5.8/drivers/net/Makefile      2020-08-12 21:08:08.643831110 +0200
1094@@ -14,6 +14,7 @@ obj-$(CONFIG_WIREGUARD) += wireguard/
1095 obj-$(CONFIG_EQUALIZER) += eql.o
1096 obj-$(CONFIG_IFB) += ifb.o
1097 obj-$(CONFIG_MACSEC) += macsec.o
1098+obj-$(CONFIG_IMQ) += imq.o
1099 obj-$(CONFIG_MACVLAN) += macvlan.o
1100 obj-$(CONFIG_MACVTAP) += macvtap.o
1101 obj-$(CONFIG_MII) += mii.o
1102diff -purN linux-5.8_org/include/linux/imq.h linux-5.8/include/linux/imq.h
1103--- linux-5.8_org/include/linux/imq.h   1970-01-01 01:00:00.000000000 +0100
1104+++ linux-5.8/include/linux/imq.h       2020-08-12 21:08:08.643831110 +0200
1105@@ -0,0 +1,13 @@
1106+#ifndef _IMQ_H
1107+#define _IMQ_H
1108+
1109+/* IFMASK (16 device indexes, 0 to 15) and flag(s) fit in 5 bits */
1110+#define IMQ_F_BITS     5
1111+
1112+#define IMQ_F_IFMASK   0x0f
1113+#define IMQ_F_ENQUEUE  0x10
1114+
1115+#define IMQ_MAX_DEVS   (IMQ_F_IFMASK + 1)
1116+
1117+#endif /* _IMQ_H */
1118+
1119diff -purN linux-5.8_org/include/linux/netdevice.h linux-5.8/include/linux/netdevice.h
1120--- linux-5.8_org/include/linux/netdevice.h     2020-08-02 23:21:45.000000000 +0200
1121+++ linux-5.8/include/linux/netdevice.h 2020-08-12 21:08:08.647831063 +0200
1122@@ -1998,6 +1998,11 @@ struct net_device {
1123 /*
1124  * Cache lines mostly used on receive path (including eth_type_trans())
1125  */
1126+
1127+#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE)
1128+       unsigned long           last_rx;
1129+#endif
1130+
1131        /* Interface address info used in eth_type_trans() */
1132        unsigned char           *dev_addr;
1133 
1134@@ -4216,6 +4221,19 @@ static inline void netif_tx_unlock_bh(st
1135        }                                               \
1136 }
1137 
1138+#define HARD_TX_LOCK_BH(dev, txq) { /* lock txq unless dev is LLTX */ \
1139+    if (((dev)->features & NETIF_F_LLTX) == 0) {  \
1140+        __netif_tx_lock_bh(txq);      \
1141+    }                       \
1142+}
1143+
1144+#define HARD_TX_UNLOCK_BH(dev, txq) { /* unlock txq unless dev is LLTX */ \
1145+    if (((dev)->features & NETIF_F_LLTX) == 0) {  \
1146+        __netif_tx_unlock_bh(txq);         \
1147+    }                       \
1148+}
1149+
1150+
1151 static inline void netif_tx_disable(struct net_device *dev)
1152 {
1153        unsigned int i;
1154diff -purN linux-5.8_org/include/linux/netfilter/xt_IMQ.h linux-5.8/include/linux/netfilter/xt_IMQ.h
1155--- linux-5.8_org/include/linux/netfilter/xt_IMQ.h      1970-01-01 01:00:00.000000000 +0100
1156+++ linux-5.8/include/linux/netfilter/xt_IMQ.h  2020-08-12 21:08:08.647831063 +0200
1157@@ -0,0 +1,9 @@
1158+#ifndef _XT_IMQ_H
1159+#define _XT_IMQ_H
1160+
1161+struct xt_imq_info {
1162+       unsigned int todev;     /* target imq device */
1163+};
1164+
1165+#endif /* _XT_IMQ_H */
1166+
1167diff -purN linux-5.8_org/include/linux/netfilter_ipv4/ipt_IMQ.h linux-5.8/include/linux/netfilter_ipv4/ipt_IMQ.h
1168--- linux-5.8_org/include/linux/netfilter_ipv4/ipt_IMQ.h        1970-01-01 01:00:00.000000000 +0100
1169+++ linux-5.8/include/linux/netfilter_ipv4/ipt_IMQ.h    2020-08-12 21:08:08.647831063 +0200
1170@@ -0,0 +1,10 @@
1171+#ifndef _IPT_IMQ_H
1172+#define _IPT_IMQ_H
1173+
1174+/* Backwards compatibility for old userspace */
1175+#include <linux/netfilter/xt_IMQ.h>
1176+
1177+#define ipt_imq_info xt_imq_info
1178+
1179+#endif /* _IPT_IMQ_H */
1180+
1181diff -purN linux-5.8_org/include/linux/netfilter_ipv6/ip6t_IMQ.h linux-5.8/include/linux/netfilter_ipv6/ip6t_IMQ.h
1182--- linux-5.8_org/include/linux/netfilter_ipv6/ip6t_IMQ.h       1970-01-01 01:00:00.000000000 +0100
1183+++ linux-5.8/include/linux/netfilter_ipv6/ip6t_IMQ.h   2020-08-12 21:08:08.647831063 +0200
1184@@ -0,0 +1,10 @@
1185+#ifndef _IP6T_IMQ_H
1186+#define _IP6T_IMQ_H
1187+
1188+/* Backwards compatibility for old userspace */
1189+#include <linux/netfilter/xt_IMQ.h>
1190+
1191+#define ip6t_imq_info xt_imq_info
1192+
1193+#endif /* _IP6T_IMQ_H */
1194+
1195diff -purN linux-5.8_org/include/linux/skbuff.h linux-5.8/include/linux/skbuff.h
1196--- linux-5.8_org/include/linux/skbuff.h        2020-08-02 23:21:45.000000000 +0200
1197+++ linux-5.8/include/linux/skbuff.h    2020-08-12 21:08:08.647831063 +0200
1198@@ -40,6 +40,9 @@
1199 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
1200 #include <linux/netfilter/nf_conntrack_common.h>
1201 #endif
1202+#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE)
1203+#include <linux/imq.h>
1204+#endif
1205 
1206 /* The interface for checksum offload between the stack and networking drivers
1207  * is as follows...
1208@@ -744,6 +747,9 @@ struct sk_buff {
1209         * first. This is owned by whoever has the skb queued ATM.
1210         */
1211        char                    cb[48] __aligned(8);
1212+#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE)
1213+       void                    *cb_next;
1214+#endif
1215 
1216        union {
1217                struct {
1218@@ -756,6 +762,9 @@ struct sk_buff {
1219 #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
1220        unsigned long            _nfct;
1221 #endif
1222+#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE)
1223+       struct nf_queue_entry *nf_queue_entry;
1224+#endif
1225        unsigned int            len,
1226                                data_len;
1227        __u16                   mac_len,
1228@@ -845,6 +854,9 @@ struct sk_buff {
1229        __u8                    offload_fwd_mark:1;
1230        __u8                    offload_l3_fwd_mark:1;
1231 #endif
1232+#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE)
1233+       __u8                    imq_flags:IMQ_F_BITS;
1234+#endif
1235 #ifdef CONFIG_NET_CLS_ACT
1236        __u8                    tc_skip_classify:1;
1237        __u8                    tc_at_ingress:1;
1238@@ -1057,6 +1069,10 @@ void skb_tx_error(struct sk_buff *skb);
1239 void consume_skb(struct sk_buff *skb);
1240 void __consume_stateless_skb(struct sk_buff *skb);
1241 void  __kfree_skb(struct sk_buff *skb);
1242+#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE)
1243+int skb_save_cb(struct sk_buff *skb);
1244+int skb_restore_cb(struct sk_buff *skb);
1245+#endif
1246 extern struct kmem_cache *skbuff_head_cache;
1247 
1248 void kfree_skb_partial(struct sk_buff *skb, bool head_stolen);
1249@@ -4257,6 +4273,10 @@ static inline void __nf_copy(struct sk_b
1250        dst->_nfct = src->_nfct;
1251        nf_conntrack_get(skb_nfct(src));
1252 #endif
1253+#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE)
1254+       dst->imq_flags = src->imq_flags;
1255+       dst->nf_queue_entry = src->nf_queue_entry;
1256+#endif
1257 #if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) || defined(CONFIG_NF_TABLES)
1258        if (copy)
1259                dst->nf_trace = src->nf_trace;
1260diff -purN linux-5.8_org/include/net/netfilter/nf_queue.h linux-5.8/include/net/netfilter/nf_queue.h
1261--- linux-5.8_org/include/net/netfilter/nf_queue.h      2020-08-02 23:21:45.000000000 +0200
1262+++ linux-5.8/include/net/netfilter/nf_queue.h  2020-08-22 21:09:43.290601818 +0200
1263@@ -40,6 +40,11 @@ void nf_reinject(struct nf_queue_entry *
1264 void nf_queue_entry_get_refs(struct nf_queue_entry *entry);
1265 void nf_queue_entry_free(struct nf_queue_entry *entry);
1266 
1267+#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE)
1268+void nf_register_queue_imq_handler(const struct nf_queue_handler *qh);
1269+void nf_unregister_queue_imq_handler(void);
1270+#endif
1271+
1272 static inline void init_hashrandom(u32 *jhash_initval)
1273 {
1274        while (*jhash_initval == 0)
1275diff -purN linux-5.8_org/include/net/pkt_sched.h linux-5.8/include/net/pkt_sched.h
1276--- linux-5.8_org/include/net/pkt_sched.h       2020-08-02 23:21:45.000000000 +0200
1277+++ linux-5.8/include/net/pkt_sched.h   2020-08-12 21:08:08.647831063 +0200
1278@@ -123,6 +123,8 @@ bool sch_direct_xmit(struct sk_buff *skb
1279 
1280 void __qdisc_run(struct Qdisc *q);
1281 
1282+struct sk_buff *qdisc_dequeue_skb(struct Qdisc *q, bool *validate);
1283+
1284 static inline void qdisc_run(struct Qdisc *q)
1285 {
1286        if (qdisc_run_begin(q)) {
1287diff -purN linux-5.8_org/include/net/sch_generic.h linux-5.8/include/net/sch_generic.h
1288--- linux-5.8_org/include/net/sch_generic.h     2020-08-02 23:21:45.000000000 +0200
1289+++ linux-5.8/include/net/sch_generic.h 2020-08-12 21:08:08.651831016 +0200
1290@@ -795,6 +795,13 @@ static inline int qdisc_enqueue(struct s
1291        return sch->enqueue(skb, sch, to_free);
1292 }
1293 
1294+static inline int qdisc_enqueue_root(struct sk_buff *skb, struct Qdisc *sch,
1295+                                     struct sk_buff **to_free)
1296+{
1297+    qdisc_skb_cb(skb)->pkt_len = skb->len;
1298+    return qdisc_enqueue(skb, sch, to_free) & NET_XMIT_MASK;
1299+}
1300+
1301 static inline void _bstats_update(struct gnet_stats_basic_packed *bstats,
1302                                  __u64 bytes, __u32 packets)
1303 {
1304diff -purN linux-5.8_org/include/uapi/linux/netfilter.h linux-5.8/include/uapi/linux/netfilter.h
1305--- linux-5.8_org/include/uapi/linux/netfilter.h        2020-08-02 23:21:45.000000000 +0200
1306+++ linux-5.8/include/uapi/linux/netfilter.h    2020-08-12 21:08:08.651831016 +0200
1307@@ -14,7 +14,8 @@
1308 #define NF_QUEUE 3
1309 #define NF_REPEAT 4
1310 #define NF_STOP 5      /* Deprecated, for userspace nf_queue compatibility. */
1311-#define NF_MAX_VERDICT NF_STOP
1312+#define NF_IMQ_QUEUE 6
1313+#define NF_MAX_VERDICT NF_IMQ_QUEUE
1314 
1315 /* we overload the higher bits for encoding auxiliary data such as the queue
1316  * number or errno values. Not nice, but better than additional function
1317diff -purN linux-5.8_org/net/core/dev.c linux-5.8/net/core/dev.c
1318--- linux-5.8_org/net/core/dev.c        2020-08-02 23:21:45.000000000 +0200
1319+++ linux-5.8/net/core/dev.c    2020-08-12 21:08:08.651831016 +0200
1320@@ -138,6 +138,9 @@
1321 #include <linux/hrtimer.h>
1322 #include <linux/netfilter_ingress.h>
1323 #include <linux/crash_dump.h>
1324+#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE)
1325+#include <linux/imq.h>
1326+#endif
1327 #include <linux/sctp.h>
1328 #include <net/udp_tunnel.h>
1329 #include <linux/net_namespace.h>
1330@@ -3548,6 +3551,13 @@ static int xmit_one(struct sk_buff *skb,
1331        unsigned int len;
1332        int rc;
1333 
1334+#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE)
1335+       if ((!list_empty(&ptype_all) || !list_empty(&dev->ptype_all)) &&
1336+               !(skb->imq_flags & IMQ_F_ENQUEUE))
1337+#else
1338+       if (!list_empty(&ptype_all) || !list_empty(&dev->ptype_all))
1339+#endif
1340+
1341        if (dev_nit_active(dev))
1342                dev_queue_xmit_nit(skb, dev);
1343 
1344@@ -3587,6 +3597,8 @@ out:
1345        return skb;
1346 }
1347 
1348+EXPORT_SYMBOL_GPL(dev_hard_start_xmit);
1349+
1350 static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
1351                                          netdev_features_t features)
1352 {
1353diff -purN linux-5.8_org/net/core/skbuff.c linux-5.8/net/core/skbuff.c
1354--- linux-5.8_org/net/core/skbuff.c     2020-08-02 23:21:45.000000000 +0200
1355+++ linux-5.8/net/core/skbuff.c 2020-08-12 21:08:08.651831016 +0200
1356@@ -87,6 +87,56 @@ static struct kmem_cache *skbuff_ext_cac
1357 int sysctl_max_skb_frags __read_mostly = MAX_SKB_FRAGS;
1358 EXPORT_SYMBOL(sysctl_max_skb_frags);
1359 
1360+#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE)
1361+static struct kmem_cache *skbuff_cb_store_cache __read_mostly;
1362+
1363+/* Control buffer save/restore for IMQ devices */
1364+struct skb_cb_table {
1365+       char                    cb[48] __aligned(8);
1366+       void                    *cb_next;
1367+};
1368+
1369+int skb_save_cb(struct sk_buff *skb)
1370+{
1371+       struct skb_cb_table *next;
1372+
1373+       next = kmem_cache_alloc(skbuff_cb_store_cache, GFP_ATOMIC);
1374+       if (!next)
1375+               return -ENOMEM;
1376+
1377+       BUILD_BUG_ON(sizeof(skb->cb) != sizeof(next->cb));
1378+
1379+       memcpy(next->cb, skb->cb, sizeof(skb->cb));
1380+       next->cb_next = skb->cb_next;
1381+       skb->cb_next = next;
1382+       smp_wmb();
1383+
1384+       return 0;
1385+}
1386+EXPORT_SYMBOL(skb_save_cb);
1387+
1388+int skb_restore_cb(struct sk_buff *skb)
1389+{
1390+       struct skb_cb_table *next;
1391+
1392+       if (!skb->cb_next)
1393+               return 0;
1394+
1395+       next = skb->cb_next;
1396+
1397+       BUILD_BUG_ON(sizeof(skb->cb) != sizeof(next->cb));
1398+
1399+       memcpy(skb->cb, next->cb, sizeof(skb->cb));
1400+       skb->cb_next = next->cb_next;
1401+       smp_wmb();
1402+
1403+       kmem_cache_free(skbuff_cb_store_cache, next);
1404+
1405+       return 0;
1406+}
1407+EXPORT_SYMBOL(skb_restore_cb);
1408+#endif
1409+
1410 /**
1411  *     skb_panic - private function for out-of-line support
1412  *     @skb:   buffer
1413@@ -650,6 +700,28 @@ void skb_release_head_state(struct sk_bu
1414                WARN_ON(in_irq());
1415                skb->destructor(skb);
1416        }
1417+#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE)
1418+       /*
1419+        * This should not happen. When it does, avoid memleak by restoring
1420+        * the chain of cb-backups.
1421+        */
1422+       while (skb->cb_next != NULL) {
1423+               if (net_ratelimit())
1424+                       pr_warn("IMQ: kfree_skb: skb->cb_next: %08x\n",
1425+                               (unsigned int)(uintptr_t)skb->cb_next);
1426+
1427+               skb_restore_cb(skb);
1428+       }
1429+       /*
1430+        * This should not happen either, nf_queue_entry is nullified in
1431+        * imq_dev_xmit(). If we have non-NULL nf_queue_entry then we are
1432+        * leaking entry pointers, maybe memory. We don't know if this is
1433+        * pointer to already freed memory, or should this be freed.
1434+        * If this happens we need to add refcounting, etc for nf_queue_entry.
1435+        */
1436+       if (skb->nf_queue_entry && net_ratelimit())
1437+               pr_warn("%s\n", "IMQ: kfree_skb: skb->nf_queue_entry != NULL");
1438+#endif
1439 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
1440        nf_conntrack_put(skb_nfct(skb));
1441 #endif
1442@@ -934,6 +1006,9 @@ static void __copy_skb_header(struct sk_
1443        skb_dst_copy(new, old);
1444        __skb_ext_copy(new, old);
1445        __nf_copy(new, old, false);
1446+#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE)
1447+       new->cb_next = NULL;
1448+#endif
1449 
1450        /* Note : this field could be in headers_start/headers_end section
1451         * It is not yet because we do not want to have a 16 bit hole
1452@@ -4257,6 +4332,13 @@ void __init skb_init(void)
1453                                                0,
1454                                                SLAB_HWCACHE_ALIGN|SLAB_PANIC,
1455                                                NULL);
1456+#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE)
1457+       skbuff_cb_store_cache = kmem_cache_create("skbuff_cb_store_cache",
1458+                                               sizeof(struct skb_cb_table),
1459+                                               0,
1460+                                               SLAB_HWCACHE_ALIGN|SLAB_PANIC,
1461+                                               NULL);
1462+#endif
1463        skb_extensions_init();
1464 }
1465 
1466diff -purN linux-5.8_org/net/netfilter/core.c linux-5.8/net/netfilter/core.c
1467--- linux-5.8_org/net/netfilter/core.c  2020-08-02 23:21:45.000000000 +0200
1468+++ linux-5.8/net/netfilter/core.c      2020-08-12 21:08:08.651831016 +0200
1469@@ -519,6 +519,11 @@ int nf_hook_slow(struct sk_buff *skb, st
1470                        if (ret == 0)
1471                                ret = -EPERM;
1472                        return ret;
1473+               case NF_IMQ_QUEUE:
1474+                       ret = nf_queue(skb, state, s, verdict);
1475+                       if (ret == -ECANCELED)
1476+                               continue;
1477+                       return ret;
1478                case NF_QUEUE:
1479                        ret = nf_queue(skb, state, s, verdict);
1480                        if (ret == 1)
1481diff -purN linux-5.8_org/net/netfilter/Kconfig linux-5.8/net/netfilter/Kconfig
1482--- linux-5.8_org/net/netfilter/Kconfig 2020-08-02 23:21:45.000000000 +0200
1483+++ linux-5.8/net/netfilter/Kconfig     2020-08-12 21:08:08.651831016 +0200
1484@@ -921,6 +921,18 @@ config NETFILTER_XT_TARGET_LOG
1485 
1486          To compile it as a module, choose M here.  If unsure, say N.
1487 
1488+config NETFILTER_XT_TARGET_IMQ
1489+        tristate '"IMQ" target support'
1490+       depends on NETFILTER_XTABLES
1491+       depends on IP_NF_MANGLE || IP6_NF_MANGLE
1492+       select IMQ
1493+       default m if NETFILTER_ADVANCED=n
1494+        help
1495+          This option adds an `IMQ' target which is used to specify if and
1496+          to which imq device packets should get enqueued/dequeued.
1497+
1498+          To compile it as a module, choose M here.  If unsure, say N.
1499+
1500 config NETFILTER_XT_TARGET_MARK
1501        tristate '"MARK" target support'
1502        depends on NETFILTER_ADVANCED
1503diff -purN linux-5.8_org/net/netfilter/Makefile linux-5.8/net/netfilter/Makefile
1504--- linux-5.8_org/net/netfilter/Makefile        2020-08-02 23:21:45.000000000 +0200
1505+++ linux-5.8/net/netfilter/Makefile    2020-08-12 21:08:08.655830969 +0200
1506@@ -147,6 +147,7 @@ obj-$(CONFIG_NETFILTER_XT_TARGET_CT) +=
1507 obj-$(CONFIG_NETFILTER_XT_TARGET_DSCP) += xt_DSCP.o
1508 obj-$(CONFIG_NETFILTER_XT_TARGET_HL) += xt_HL.o
1509 obj-$(CONFIG_NETFILTER_XT_TARGET_HMARK) += xt_HMARK.o
1510+obj-$(CONFIG_NETFILTER_XT_TARGET_IMQ) += xt_IMQ.o
1511 obj-$(CONFIG_NETFILTER_XT_TARGET_LED) += xt_LED.o
1512 obj-$(CONFIG_NETFILTER_XT_TARGET_LOG) += xt_LOG.o
1513 obj-$(CONFIG_NETFILTER_XT_TARGET_NETMAP) += xt_NETMAP.o
1514diff -purN linux-5.8_org/net/netfilter/nf_queue.c linux-5.8/net/netfilter/nf_queue.c
1515--- linux-5.8_org/net/netfilter/nf_queue.c      2020-08-02 23:21:45.000000000 +0200
1516+++ linux-5.8/net/netfilter/nf_queue.c  2020-08-12 21:08:08.655830969 +0200
1517@@ -29,6 +29,23 @@
1518  * receives, no matter what.
1519  */
1520 
1521+#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE)
1522+static const struct nf_queue_handler __rcu *queue_imq_handler __read_mostly;
1523+
1524+void nf_register_queue_imq_handler(const struct nf_queue_handler *qh)
1525+{
1526+       rcu_assign_pointer(queue_imq_handler, qh);
1527+}
1528+EXPORT_SYMBOL_GPL(nf_register_queue_imq_handler);
1529+
1530+void nf_unregister_queue_imq_handler(void)
1531+{
1532+       RCU_INIT_POINTER(queue_imq_handler, NULL);
1533+       synchronize_rcu();
1534+}
1535+EXPORT_SYMBOL_GPL(nf_unregister_queue_imq_handler);
1536+#endif
1537+
1538 /* return EBUSY when somebody else is registered, return EEXIST if the
1539  * same handler is registered, return 0 in case of success. */
1540 void nf_register_queue_handler(struct net *net, const struct nf_queue_handler *qh)
1541@@ -153,16 +170,28 @@ static void nf_ip6_saveroute(const struc
1542 }
1543 
1544 static int __nf_queue(struct sk_buff *skb, const struct nf_hook_state *state,
1545-                     unsigned int index, unsigned int queuenum)
1546+                     unsigned int index, unsigned int verdict)
1547 {
1548        struct nf_queue_entry *entry = NULL;
1549        const struct nf_queue_handler *qh;
1550        struct net *net = state->net;
1551        unsigned int route_key_size;
1552+       unsigned int queuetype = verdict & NF_VERDICT_MASK;
1553+       unsigned int queuenum  = verdict >> NF_VERDICT_QBITS;
1554        int status;
1555 
1556        /* QUEUE == DROP if no one is waiting, to be safe. */
1557-       qh = rcu_dereference(net->nf.queue_handler);
1558+       if (queuetype == NF_IMQ_QUEUE) {
1559+#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE)
1560+       qh = rcu_dereference(queue_imq_handler);
1561+#else
1562+       BUG();
1563+       goto err_unlock;
1564+#endif
1565+       } else {
1566+               qh = rcu_dereference(net->nf.queue_handler);
1567+       }
1568+
1569        if (!qh)
1570                return -ESRCH;
1571 
1572@@ -222,8 +251,16 @@ int nf_queue(struct sk_buff *skb, struct
1573 {
1574        int ret;
1575 
1576-       ret = __nf_queue(skb, state, index, verdict >> NF_VERDICT_QBITS);
1577+       ret = __nf_queue(skb, state, index, verdict);
1578        if (ret < 0) {
1579+
1580+#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE)
1581+       /* IMQ Bypass */
1582+       if (ret == -ECANCELED && skb->imq_flags == 0) {
1583+               return 1;
1584+       }
1585+#endif
1586+
1587                if (ret == -ESRCH &&
1588                    (verdict & NF_VERDICT_FLAG_QUEUE_BYPASS))
1589                        return 1;
1590@@ -326,6 +363,7 @@ next_hook:
1591                local_bh_enable();
1592                break;
1593        case NF_QUEUE:
1594+       case NF_IMQ_QUEUE:
1595                err = nf_queue(skb, &entry->state, i, verdict);
1596                if (err == 1)
1597                        goto next_hook;
1598diff -purN linux-5.8_org/net/netfilter/xt_IMQ.c linux-5.8/net/netfilter/xt_IMQ.c
1599--- linux-5.8_org/net/netfilter/xt_IMQ.c        1970-01-01 01:00:00.000000000 +0100
1600+++ linux-5.8/net/netfilter/xt_IMQ.c    2020-08-12 21:08:08.655830969 +0200
1601@@ -0,0 +1,72 @@
1602+/*
1603+ * This target marks packets to be enqueued to an imq device
1604+ */
1605+#include <linux/module.h>
1606+#include <linux/skbuff.h>
1607+#include <linux/netfilter/x_tables.h>
1608+#include <linux/netfilter/xt_IMQ.h>
1609+#include <linux/imq.h>
1610+
1611+static unsigned int imq_target(struct sk_buff *pskb,
1612+                               const struct xt_action_param *par)
1613+{
1614+       const struct xt_imq_info *mr = par->targinfo;
1615+
1616+       pskb->imq_flags = (mr->todev & IMQ_F_IFMASK) | IMQ_F_ENQUEUE;
1617+
1618+       return XT_CONTINUE;
1619+}
1620+
1621+static int imq_checkentry(const struct xt_tgchk_param *par)
1622+{
1623+       struct xt_imq_info *mr = par->targinfo;
1624+
1625+       if (mr->todev > IMQ_MAX_DEVS - 1) {
1626+               pr_warn("IMQ: invalid device specified, highest is %u\n",
1627+                       IMQ_MAX_DEVS - 1);
1628+               return -EINVAL;
1629+       }
1630+
1631+       return 0;
1632+}
1633+
1634+static struct xt_target xt_imq_reg[] __read_mostly = {
1635+       {
1636+               .name           = "IMQ",
1637+               .family         = AF_INET,
1638+               .checkentry     = imq_checkentry,
1639+               .target         = imq_target,
1640+               .targetsize     = sizeof(struct xt_imq_info),
1641+               .table          = "mangle",
1642+               .me             = THIS_MODULE
1643+       },
1644+       {
1645+               .name           = "IMQ",
1646+               .family         = AF_INET6,
1647+               .checkentry     = imq_checkentry,
1648+               .target         = imq_target,
1649+               .targetsize     = sizeof(struct xt_imq_info),
1650+               .table          = "mangle",
1651+               .me             = THIS_MODULE
1652+       },
1653+};
1654+
1655+static int __init imq_init(void)
1656+{
1657+       return xt_register_targets(xt_imq_reg, ARRAY_SIZE(xt_imq_reg));
1658+}
1659+
1660+static void __exit imq_fini(void)
1661+{
1662+       xt_unregister_targets(xt_imq_reg, ARRAY_SIZE(xt_imq_reg));
1663+}
1664+
1665+module_init(imq_init);
1666+module_exit(imq_fini);
1667+
1668+MODULE_AUTHOR("https://github.com/imq/linuximq");
1669+MODULE_DESCRIPTION("Pseudo-driver for the intermediate queue device. See https://github.com/imq/linuximq/wiki for more information.");
1670+MODULE_LICENSE("GPL");
1671+MODULE_ALIAS("ipt_IMQ");
1672+MODULE_ALIAS("ip6t_IMQ");
1673+
1674diff -purN linux-5.8_org/net/sched/sch_generic.c linux-5.8/net/sched/sch_generic.c
1675--- linux-5.8_org/net/sched/sch_generic.c       2020-08-02 23:21:45.000000000 +0200
1676+++ linux-5.8/net/sched/sch_generic.c   2020-08-12 21:08:08.655830969 +0200
1677@@ -273,6 +273,14 @@ trace:
1678        return skb;
1679 }
1680 
1681+struct sk_buff *qdisc_dequeue_skb(struct Qdisc *q, bool *validate)
1682+{
1683+       int packets;
1684+
1685+       return dequeue_skb(q, validate, &packets);
1686+}
1687+EXPORT_SYMBOL(qdisc_dequeue_skb);
1688+
1689 /*
1690  * Transmit possibly several skbs, and handle the return status as
1691  * required. Owning running seqcount bit guarantees that
Note: See TracBrowser for help on using the repository browser.