skbuff: Add pskb_extract() helper function
author Sowmini Varadhan <sowmini.varadhan@oracle.com>
Sat, 23 Apr 2016 01:36:35 +0000 (18:36 -0700)
committer David S. Miller <davem@davemloft.net>
Mon, 25 Apr 2016 20:54:14 +0000 (16:54 -0400)
A pattern of skb usage seen in modules such as RDS-TCP is to
extract `to_copy' bytes from the received TCP segment, starting
at some offset `off' into a new skb `clone'. This is done in
the ->data_ready callback, where the clone skb is queued up for rx on
the PF_RDS socket, while the parent TCP segment is returned unchanged
back to the TCP engine.

The existing code uses the sequence
clone = skb_clone(..);
pskb_pull(clone, off, ..);
pskb_trim(clone, to_copy, ..);
with the intention of discarding the first `off' bytes. However,
skb_clone() + pskb_pull() implies pskb_expand_head(), which ends
up doing a redundant memcpy of bytes that will then get discarded
in __pskb_pull_tail().

To avoid this inefficiency, this commit adds pskb_extract() that
creates the clone, and memcpy's only the relevant header/frag/frag_list
to the start of `clone'. pskb_trim() is then invoked to trim clone
down to the requested to_copy bytes.

Signed-off-by: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
include/linux/skbuff.h
net/core/skbuff.c

index da0ace389feca6f2fd2c012ec7e76f6424a8653e..a1ce63979ad8873fb8309196acbd2590e9d7cecf 100644 (file)
@@ -2986,6 +2986,8 @@ struct sk_buff *skb_vlan_untag(struct sk_buff *skb);
 int skb_ensure_writable(struct sk_buff *skb, int write_len);
 int skb_vlan_pop(struct sk_buff *skb);
 int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci);
+struct sk_buff *pskb_extract(struct sk_buff *skb, int off, int to_copy,
+                            gfp_t gfp);
 
 static inline int memcpy_from_msg(void *data, struct msghdr *msg, int len)
 {
index 7ff7788b01518eb9f1c14db8f188c463b64cdfe1..7a1d48983f81e68b42e8beb664db9aef00440f13 100644 (file)
@@ -4622,3 +4622,245 @@ failure:
        return NULL;
 }
 EXPORT_SYMBOL(alloc_skb_with_frags);
+
+/* carve out the first off bytes from skb when off < headlen */
+static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off,
+                                   const int headlen, gfp_t gfp_mask)
+{
+       int i;
+       int size = skb_end_offset(skb);
+       int new_hlen = headlen - off;
+       u8 *data;
+       int doff = 0;
+
+       size = SKB_DATA_ALIGN(size);
+
+       if (skb_pfmemalloc(skb))
+               gfp_mask |= __GFP_MEMALLOC;
+       data = kmalloc_reserve(size +
+                              SKB_DATA_ALIGN(sizeof(struct skb_shared_info)),
+                              gfp_mask, NUMA_NO_NODE, NULL);
+       if (!data)
+               return -ENOMEM;
+
+       size = SKB_WITH_OVERHEAD(ksize(data));
+
+       /* Copy real data, and all frags */
+       skb_copy_from_linear_data_offset(skb, off, data, new_hlen);
+       skb->len -= off;
+
+       memcpy((struct skb_shared_info *)(data + size),
+              skb_shinfo(skb),
+              offsetof(struct skb_shared_info,
+                       frags[skb_shinfo(skb)->nr_frags]));
+       if (skb_cloned(skb)) {
+               /* drop the old head gracefully */
+               if (skb_orphan_frags(skb, gfp_mask)) {
+                       kfree(data);
+                       return -ENOMEM;
+               }
+               for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
+                       skb_frag_ref(skb, i);
+               if (skb_has_frag_list(skb))
+                       skb_clone_fraglist(skb);
+               skb_release_data(skb);
+       } else {
+               /* we can reuse existing recount- all we did was
+                * relocate values
+                */
+               skb_free_head(skb);
+       }
+
+       doff = (data - skb->head);
+       skb->head = data;
+       skb->data = data;
+       skb->head_frag = 0;
+#ifdef NET_SKBUFF_DATA_USES_OFFSET
+       skb->end = size;
+       doff = 0;
+#else
+       skb->end = skb->head + size;
+#endif
+       skb_set_tail_pointer(skb, skb_headlen(skb));
+       skb_headers_offset_update(skb, 0);
+       skb->cloned = 0;
+       skb->hdr_len = 0;
+       skb->nohdr = 0;
+       atomic_set(&skb_shinfo(skb)->dataref, 1);
+
+       return 0;
+}
+
+static int pskb_carve(struct sk_buff *skb, const u32 off, gfp_t gfp);
+
+/* carve out the first eat bytes from skb's frag_list. May recurse into
+ * pskb_carve()
+ *
+ * The list that is walked and rewritten is shinfo->frag_list; the skb
+ * argument itself is not referenced in the body.
+ *
+ * Returns 0 on success, -EFAULT if the list runs out before eat bytes have
+ * been consumed, and -ENOMEM if cloning or carving a list member fails.
+ * shinfo->frag_list is only modified after the walk has succeeded.
+ */
+static int pskb_carve_frag_list(struct sk_buff *skb,
+                               struct skb_shared_info *shinfo, int eat,
+                               gfp_t gfp_mask)
+{
+       struct sk_buff *list = shinfo->frag_list;
+       /* replacement for a shared, partially eaten member (if any) */
+       struct sk_buff *clone = NULL;
+       /* first original list member that is kept as-is */
+       struct sk_buff *insp = NULL;
+
+       do {
+               if (!list) {
+                       pr_err("Not enough bytes to eat. Want %d\n", eat);
+                       return -EFAULT;
+               }
+               if (list->len <= eat) {
+                       /* Eaten as whole. */
+                       eat -= list->len;
+                       list = list->next;
+                       insp = list;
+               } else {
+                       /* Eaten partially. */
+                       if (skb_shared(list)) {
+                               /* carve a private clone instead; the
+                                * original is released by the free loop
+                                * below since insp points past it
+                                */
+                               clone = skb_clone(list, gfp_mask);
+                               if (!clone)
+                                       return -ENOMEM;
+                               insp = list->next;
+                               list = clone;
+                       } else {
+                               /* This may be pulled without problems. */
+                               insp = list;
+                       }
+                       if (pskb_carve(list, eat, gfp_mask) < 0) {
+                               /* clone is still NULL here when list was
+                                * carved in place
+                                */
+                               kfree_skb(clone);
+                               return -ENOMEM;
+                       }
+                       break;
+               }
+       } while (eat);
+
+       /* Free pulled out fragments. */
+       while ((list = shinfo->frag_list) != insp) {
+               shinfo->frag_list = list->next;
+               kfree_skb(list);
+       }
+       /* And insert new clone at head. */
+       if (clone) {
+               /* list == insp here, i.e. the remainder of the old list */
+               clone->next = list;
+               shinfo->frag_list = clone;
+       }
+       return 0;
+}
+
+/* carve off first len bytes from skb. Split line (off) is in the
+ * non-linear part of skb
+ */
+static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off,
+                                      int pos, gfp_t gfp_mask)
+{
+       int i, k = 0;
+       int size = skb_end_offset(skb);
+       u8 *data;
+       const int nfrags = skb_shinfo(skb)->nr_frags;
+       struct skb_shared_info *shinfo;
+       int doff = 0;
+
+       size = SKB_DATA_ALIGN(size);
+
+       if (skb_pfmemalloc(skb))
+               gfp_mask |= __GFP_MEMALLOC;
+       data = kmalloc_reserve(size +
+                              SKB_DATA_ALIGN(sizeof(struct skb_shared_info)),
+                              gfp_mask, NUMA_NO_NODE, NULL);
+       if (!data)
+               return -ENOMEM;
+
+       size = SKB_WITH_OVERHEAD(ksize(data));
+
+       memcpy((struct skb_shared_info *)(data + size),
+              skb_shinfo(skb), offsetof(struct skb_shared_info,
+                                        frags[skb_shinfo(skb)->nr_frags]));
+       if (skb_orphan_frags(skb, gfp_mask)) {
+               kfree(data);
+               return -ENOMEM;
+       }
+       shinfo = (struct skb_shared_info *)(data + size);
+       for (i = 0; i < nfrags; i++) {
+               int fsize = skb_frag_size(&skb_shinfo(skb)->frags[i]);
+
+               if (pos + fsize > off) {
+                       shinfo->frags[k] = skb_shinfo(skb)->frags[i];
+
+                       if (pos < off) {
+                               /* Split frag.
+                                * We have two variants in this case:
+                                * 1. Move all the frag to the second
+                                *    part, if it is possible. F.e.
+                                *    this approach is mandatory for TUX,
+                                *    where splitting is expensive.
+                                * 2. Split is accurately. We make this.
+                                */
+                               shinfo->frags[0].page_offset += off - pos;
+                               skb_frag_size_sub(&shinfo->frags[0], off - pos);
+                       }
+                       skb_frag_ref(skb, i);
+                       k++;
+               }
+               pos += fsize;
+       }
+       shinfo->nr_frags = k;
+       if (skb_has_frag_list(skb))
+               skb_clone_fraglist(skb);
+
+       if (k == 0) {
+               /* split line is in frag list */
+               pskb_carve_frag_list(skb, shinfo, off - pos, gfp_mask);
+       }
+       skb_release_data(skb);
+
+       doff = (data - skb->head);
+       skb->head = data;
+       skb->head_frag = 0;
+       skb->data = data;
+#ifdef NET_SKBUFF_DATA_USES_OFFSET
+       skb->end = size;
+       doff = 0;
+#else
+       skb->end = skb->head + size;
+#endif
+       skb_reset_tail_pointer(skb);
+       skb_headers_offset_update(skb, 0);
+       skb->cloned   = 0;
+       skb->hdr_len  = 0;
+       skb->nohdr    = 0;
+       skb->len -= off;
+       skb->data_len = skb->len;
+       atomic_set(&skb_shinfo(skb)->dataref, 1);
+       return 0;
+}
+
+/* Drop the leading len bytes of skb. The helper is picked by whether the
+ * cut falls inside the linear area or at/after its end.
+ */
+static int pskb_carve(struct sk_buff *skb, const u32 len, gfp_t gfp)
+{
+       const int linear = skb_headlen(skb);
+
+       /* cut at or beyond the end of the linear data */
+       if (len >= linear)
+               return pskb_carve_inside_nonlinear(skb, len, linear, gfp);
+
+       return pskb_carve_inside_header(skb, len, linear, gfp);
+}
+
+/* Build a new skb holding to_copy bytes of skb, beginning at byte off.
+ * skb is cloned and the clone is carved and trimmed. Returns the new skb,
+ * or NULL if the clone, the carve or the trim fails.
+ */
+struct sk_buff *pskb_extract(struct sk_buff *skb, int off,
+                            int to_copy, gfp_t gfp)
+{
+       struct sk_buff *nskb;
+
+       nskb = skb_clone(skb, gfp);
+       if (!nskb)
+               return NULL;
+
+       /* drop the leading off bytes, then cut the rest down to to_copy */
+       if (pskb_carve(nskb, off, gfp) < 0)
+               goto free_nskb;
+       if (pskb_trim(nskb, to_copy))
+               goto free_nskb;
+
+       return nskb;
+
+free_nskb:
+       kfree_skb(nskb);
+       return NULL;
+}
+EXPORT_SYMBOL(pskb_extract);