aoe: for performance support larger packet payloads
author Ed Cashin <ecashin@coraid.com>
Fri, 5 Oct 2012 00:16:20 +0000 (17:16 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Fri, 5 Oct 2012 18:05:24 +0000 (03:05 +0900)
This patch adds the ability to work with large packets composed of a number of
segments, using the scatter gather feature of the block layer (biovecs)
and the network layer (skb frag array).  The motivation is the performance
gained by using a packet data payload greater than a page size and by
using the network card's scatter gather feature.

Users of the out-of-tree aoe driver already had these changes, but since
early 2011, they have complained of increased memory utilization and
higher CPU utilization during heavy writes.[1] The commit below appears
related, as it disables scatter gather on non-IP protocols inside the
harmonize_features function, even when the NIC supports sg.

  commit f01a5236bd4b140198fbcc550f085e8361fd73fa
  Author: Jesse Gross <jesse@nicira.com>
  Date:   Sun Jan 9 06:23:31 2011 +0000

      net offloading: Generalize netif_get_vlan_features().

With that regression in place, transmits always linearize sg AoE packets,
but in-kernel users did not have this patch.  Before 2.6.38, though, these
changes were working to allow sg to increase performance.

1. http://www.spinics.net/lists/linux-mm/msg15184.html

Signed-off-by: Ed Cashin <ecashin@coraid.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
drivers/block/aoe/aoe.h
drivers/block/aoe/aoeblk.c
drivers/block/aoe/aoecmd.c
drivers/block/aoe/aoedev.c
drivers/block/aoe/aoenet.c

index db195abad69889e4d499bde162ff5e818601f3b2..8ca8c8a929aeb70c8aba2fbfc7451028fe517835 100644 (file)
@@ -119,6 +119,8 @@ struct frame {
        ulong bcnt;
        sector_t lba;
        struct sk_buff *skb;
+       struct bio_vec *bv;
+       ulong bv_off;
 };
 
 struct aoeif {
index 321de7b6c44228e5b7c5cfc0a410be19a54759fe..3a8f0933cc7db9e8f4f3c40f05d0ac70c2580f26 100644 (file)
@@ -254,6 +254,7 @@ aoeblk_gdalloc(void *vp)
 {
        struct aoedev *d = vp;
        struct gendisk *gd;
+       enum { KB = 1024, MB = KB * KB, READ_AHEAD = MB, };
        ulong flags;
 
        gd = alloc_disk(AOE_PARTITIONS);
@@ -279,6 +280,8 @@ aoeblk_gdalloc(void *vp)
        if (bdi_init(&d->blkq->backing_dev_info))
                goto err_blkq;
        spin_lock_irqsave(&d->lock, flags);
+       blk_queue_max_hw_sectors(d->blkq, BLK_DEF_MAX_SECTORS);
+       d->blkq->backing_dev_info.ra_pages = READ_AHEAD / PAGE_CACHE_SIZE;
        gd->major = AOE_MAJOR;
        gd->first_minor = d->sysminor * AOE_PARTITIONS;
        gd->fops = &aoe_bdops;
index 887f68f6d79a9e615beba7525cb9d0e671917c4e..9a58242290c049dbfda60b64581ea1378dd8f45e 100644 (file)
@@ -165,7 +165,8 @@ freeframe(struct aoedev *d)
                                                rf = f;
                                        continue;
                                }
-gotone:                                skb_shinfo(skb)->nr_frags = skb->data_len = 0;
+gotone:                                skb->truesize -= skb->data_len;
+                               skb_shinfo(skb)->nr_frags = skb->data_len = 0;
                                skb_trim(skb, 0);
                                d->tgt = t;
                                ifrotate(*t);
@@ -201,6 +202,24 @@ gotone:                            skb_shinfo(skb)->nr_frags = skb->data_len = 0;
        return NULL;
 }
 
+static void
+skb_fillup(struct sk_buff *skb, struct bio_vec *bv, ulong off, ulong cnt)
+{
+       int frag = 0;
+       ulong fcnt;
+loop:
+       fcnt = bv->bv_len - (off - bv->bv_offset);
+       if (fcnt > cnt)
+               fcnt = cnt;
+       skb_fill_page_desc(skb, frag++, bv->bv_page, off, fcnt);
+       cnt -= fcnt;
+       if (cnt <= 0)
+               return;
+       bv++;
+       off = bv->bv_offset;
+       goto loop;
+}
+
 static int
 aoecmd_ata_rw(struct aoedev *d)
 {
@@ -211,7 +230,7 @@ aoecmd_ata_rw(struct aoedev *d)
        struct bio_vec *bv;
        struct aoetgt *t;
        struct sk_buff *skb;
-       ulong bcnt;
+       ulong bcnt, fbcnt;
        char writebit, extbit;
 
        writebit = 0x10;
@@ -226,8 +245,28 @@ aoecmd_ata_rw(struct aoedev *d)
        bcnt = t->ifp->maxbcnt;
        if (bcnt == 0)
                bcnt = DEFAULTBCNT;
-       if (bcnt > buf->bv_resid)
-               bcnt = buf->bv_resid;
+       if (bcnt > buf->resid)
+               bcnt = buf->resid;
+       fbcnt = bcnt;
+       f->bv = buf->bv;
+       f->bv_off = f->bv->bv_offset + (f->bv->bv_len - buf->bv_resid);
+       do {
+               if (fbcnt < buf->bv_resid) {
+                       buf->bv_resid -= fbcnt;
+                       buf->resid -= fbcnt;
+                       break;
+               }
+               fbcnt -= buf->bv_resid;
+               buf->resid -= buf->bv_resid;
+               if (buf->resid == 0) {
+                       d->inprocess = NULL;
+                       break;
+               }
+               buf->bv++;
+               buf->bv_resid = buf->bv->bv_len;
+               WARN_ON(buf->bv_resid == 0);
+       } while (fbcnt);
+
        /* initialize the headers & frame */
        skb = f->skb;
        h = (struct aoe_hdr *) skb_mac_header(skb);
@@ -238,7 +277,6 @@ aoecmd_ata_rw(struct aoedev *d)
        t->nout++;
        f->waited = 0;
        f->buf = buf;
-       f->bufaddr = page_address(bv->bv_page) + buf->bv_off;
        f->bcnt = bcnt;
        f->lba = buf->sector;
 
@@ -253,10 +291,11 @@ aoecmd_ata_rw(struct aoedev *d)
                ah->lba3 |= 0xe0;       /* LBA bit + obsolete 0xa0 */
        }
        if (bio_data_dir(buf->bio) == WRITE) {
-               skb_fill_page_desc(skb, 0, bv->bv_page, buf->bv_off, bcnt);
+               skb_fillup(skb, f->bv, f->bv_off, bcnt);
                ah->aflags |= AOEAFL_WRITE;
                skb->len += bcnt;
                skb->data_len = bcnt;
+               skb->truesize += bcnt;
                t->wpkts++;
        } else {
                t->rpkts++;
@@ -267,18 +306,7 @@ aoecmd_ata_rw(struct aoedev *d)
 
        /* mark all tracking fields and load out */
        buf->nframesout += 1;
-       buf->bv_off += bcnt;
-       buf->bv_resid -= bcnt;
-       buf->resid -= bcnt;
        buf->sector += bcnt >> 9;
-       if (buf->resid == 0) {
-               d->inprocess = NULL;
-       } else if (buf->bv_resid == 0) {
-               buf->bv = ++bv;
-               buf->bv_resid = bv->bv_len;
-               WARN_ON(buf->bv_resid == 0);
-               buf->bv_off = bv->bv_offset;
-       }
 
        skb->dev = t->ifp->nd;
        skb = skb_clone(skb, GFP_ATOMIC);
@@ -365,14 +393,12 @@ resend(struct aoedev *d, struct aoetgt *t, struct frame *f)
                put_lba(ah, f->lba);
 
                n = f->bcnt;
-               if (n > DEFAULTBCNT)
-                       n = DEFAULTBCNT;
                ah->scnt = n >> 9;
                if (ah->aflags & AOEAFL_WRITE) {
-                       skb_fill_page_desc(skb, 0, virt_to_page(f->bufaddr),
-                               offset_in_page(f->bufaddr), n);
+                       skb_fillup(skb, f->bv, f->bv_off, n);
                        skb->len = sizeof *h + sizeof *ah + n;
                        skb->data_len = n;
+                       skb->truesize += n;
                }
        }
        skb->dev = t->ifp->nd;
@@ -531,20 +557,6 @@ rexmit_timer(ulong vp)
                                ejectif(t, ifp);
                                ifp = NULL;
                        }
-
-                       if (ata_scnt(skb_mac_header(f->skb)) > DEFAULTBCNT / 512
-                       && ifp && ++ifp->lostjumbo > (t->nframes << 1)
-                       && ifp->maxbcnt != DEFAULTBCNT) {
-                               printk(KERN_INFO
-                                       "aoe: e%ld.%d: "
-                                       "too many lost jumbo on "
-                                       "%s:%pm - "
-                                       "falling back to %d frames.\n",
-                                       d->aoemajor, d->aoeminor,
-                                       ifp->nd->name, t->addr,
-                                       DEFAULTBCNT);
-                               ifp->maxbcnt = 0;
-                       }
                        resend(d, t, f);
                }
 
@@ -737,6 +749,45 @@ diskstats(struct gendisk *disk, struct bio *bio, ulong duration, sector_t sector
        part_stat_unlock();
 }
 
+static void
+bvcpy(struct bio_vec *bv, ulong off, struct sk_buff *skb, ulong cnt)
+{
+       ulong fcnt;
+       char *p;
+       int soff = 0;
+loop:
+       fcnt = bv->bv_len - (off - bv->bv_offset);
+       if (fcnt > cnt)
+               fcnt = cnt;
+       p = page_address(bv->bv_page) + off;
+       skb_copy_bits(skb, soff, p, fcnt);
+       soff += fcnt;
+       cnt -= fcnt;
+       if (cnt <= 0)
+               return;
+       bv++;
+       off = bv->bv_offset;
+       goto loop;
+}
+
+static void
+fadvance(struct frame *f, ulong cnt)
+{
+       ulong fcnt;
+
+       f->lba += cnt >> 9;
+loop:
+       fcnt = f->bv->bv_len - (f->bv_off - f->bv->bv_offset);
+       if (fcnt > cnt) {
+               f->bv_off += cnt;
+               return;
+       }
+       cnt -= fcnt;
+       f->bv++;
+       f->bv_off = f->bv->bv_offset;
+       goto loop;
+}
+
 void
 aoecmd_ata_rsp(struct sk_buff *skb)
 {
@@ -754,6 +805,7 @@ aoecmd_ata_rsp(struct sk_buff *skb)
        u16 aoemajor;
 
        hin = (struct aoe_hdr *) skb_mac_header(skb);
+       skb_pull(skb, sizeof(*hin));
        aoemajor = get_unaligned_be16(&hin->major);
        d = aoedev_by_aoeaddr(aoemajor, hin->minor);
        if (d == NULL) {
@@ -791,7 +843,8 @@ aoecmd_ata_rsp(struct sk_buff *skb)
 
        calc_rttavg(d, tsince(f->tag));
 
-       ahin = (struct aoe_atahdr *) (hin+1);
+       ahin = (struct aoe_atahdr *) skb->data;
+       skb_pull(skb, sizeof(*ahin));
        hout = (struct aoe_hdr *) skb_mac_header(f->skb);
        ahout = (struct aoe_atahdr *) (hout+1);
        buf = f->buf;
@@ -810,7 +863,7 @@ aoecmd_ata_rsp(struct sk_buff *skb)
                switch (ahout->cmdstat) {
                case ATA_CMD_PIO_READ:
                case ATA_CMD_PIO_READ_EXT:
-                       if (skb->len - sizeof *hin - sizeof *ahin < n) {
+                       if (skb->len < n) {
                                printk(KERN_ERR
                                        "aoe: %s.  skb->len=%d need=%ld\n",
                                        "runt data size in read", skb->len, n);
@@ -818,7 +871,7 @@ aoecmd_ata_rsp(struct sk_buff *skb)
                                spin_unlock_irqrestore(&d->lock, flags);
                                return;
                        }
-                       memcpy(f->bufaddr, ahin+1, n);
+                       bvcpy(f->bv, f->bv_off, skb, n);
                case ATA_CMD_PIO_WRITE:
                case ATA_CMD_PIO_WRITE_EXT:
                        ifp = getif(t, skb->dev);
@@ -828,21 +881,22 @@ aoecmd_ata_rsp(struct sk_buff *skb)
                                        ifp->lostjumbo = 0;
                        }
                        if (f->bcnt -= n) {
-                               f->lba += n >> 9;
-                               f->bufaddr += n;
+                               fadvance(f, n);
                                resend(d, t, f);
                                goto xmit;
                        }
                        break;
                case ATA_CMD_ID_ATA:
-                       if (skb->len - sizeof *hin - sizeof *ahin < 512) {
+                       if (skb->len < 512) {
                                printk(KERN_INFO
                                        "aoe: runt data size in ataid.  skb->len=%d\n",
                                        skb->len);
                                spin_unlock_irqrestore(&d->lock, flags);
                                return;
                        }
-                       ataid_complete(d, t, (char *) (ahin+1));
+                       if (skb_linearize(skb))
+                               break;
+                       ataid_complete(d, t, skb->data);
                        break;
                default:
                        printk(KERN_INFO
index 6b5110a474582fb296e7abbb0076c99a466d7fc8..b2d1fd354eac7f4e883a6a142a2f82290d221f89 100644 (file)
@@ -182,6 +182,7 @@ skbfree(struct sk_buff *skb)
                        "cannot free skb -- memory leaked.");
                return;
        }
+       skb->truesize -= skb->data_len;
        skb_shinfo(skb)->nr_frags = skb->data_len = 0;
        skb_trim(skb, 0);
        dev_kfree_skb(skb);
index 4d3bc0d49df59394ea550a74c05c4ad0c84c436b..07878076e43c415ce9111dfe095e089304438426 100644 (file)
@@ -102,7 +102,9 @@ static int
 aoenet_rcv(struct sk_buff *skb, struct net_device *ifp, struct packet_type *pt, struct net_device *orig_dev)
 {
        struct aoe_hdr *h;
+       struct aoe_atahdr *ah;
        u32 n;
+       int sn;
 
        if (dev_net(ifp) != &init_net)
                goto exit;
@@ -110,13 +112,16 @@ aoenet_rcv(struct sk_buff *skb, struct net_device *ifp, struct packet_type *pt,
        skb = skb_share_check(skb, GFP_ATOMIC);
        if (skb == NULL)
                return 0;
-       if (skb_linearize(skb))
-               goto exit;
        if (!is_aoe_netif(ifp))
                goto exit;
        skb_push(skb, ETH_HLEN);        /* (1) */
-
-       h = (struct aoe_hdr *) skb_mac_header(skb);
+       sn = sizeof(*h) + sizeof(*ah);
+       if (skb->len >= sn) {
+               sn -= skb_headlen(skb);
+               if (sn > 0 && !__pskb_pull_tail(skb, sn))
+                       goto exit;
+       }
+       h = (struct aoe_hdr *) skb->data;
        n = get_unaligned_be32(&h->tag);
        if ((h->verfl & AOEFL_RSP) == 0 || (n & 1<<31))
                goto exit;