writeback: bdi write bandwidth estimation

author Wu Fengguang <fengguang.wu@intel.com>

Sun, 29 Aug 2010 17:22:30 +0000 (11:22 -0600)

committer Wu Fengguang <fengguang.wu@intel.com>

Sun, 10 Jul 2011 05:09:01 +0000 (22:09 -0700)
author Wu Fengguang <fengguang.wu@intel.com>
Sun, 29 Aug 2010 17:22:30 +0000 (11:22 -0600)
committer Wu Fengguang <fengguang.wu@intel.com>
Sun, 10 Jul 2011 05:09:01 +0000 (22:09 -0700)
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c

index 2c947da39f6ea276e47aa59037ba7ecfb7642bfb..5826992910e9b91907aad092c6c4e6df416368ae 100644 (file)
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -692,6 +692,16 @@ static inline bool over_bground_thresh(void)
                 global_page_state(NR_UNSTABLE_NFS) > background_thresh);
  }
  
+/*
+ * Update the write bandwidth estimate of the bdi that @wb belongs to.
+ * @start_time is handed through unchanged to __bdi_update_bandwidth().
+ *
+ * Called under wb->list_lock. If there are multiple wb per bdi,
+ * only the flusher working on the first wb should do it.
+ */
+static void wb_update_bandwidth(struct bdi_writeback *wb,
+                               unsigned long start_time)
+{
+       __bdi_update_bandwidth(wb->bdi, start_time);
+}
+
  /*
   * Explicit flushing or periodic writeback of "old" data.
   *
@@ -710,6 +720,7 @@ static inline bool over_bground_thresh(void)
  static long wb_writeback(struct bdi_writeback *wb,
                          struct wb_writeback_work *work)
  {
+       unsigned long wb_start = jiffies;
         long nr_pages = work->nr_pages;
         unsigned long oldest_jif;
         struct inode *inode;
@@ -758,6 +769,8 @@ static long wb_writeback(struct bdi_writeback *wb,
                         progress = __writeback_inodes_wb(wb, work);
                 trace_writeback_written(wb->bdi, work);
  
+               wb_update_bandwidth(wb, wb_start);
+
                 /*
                  * Did we write something? Try for more
                  *
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h

index 469d56443c638dc07994ead098479b19c08e75a5..a008982e7c0817295b01a516b81df477bd2fb957 100644 (file)
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -73,6 +73,11 @@ struct backing_dev_info {
  
         struct percpu_counter bdi_stat[NR_BDI_STAT_ITEMS];
  
+       unsigned long bw_time_stamp;    /* last time write bw is updated */
+       unsigned long written_stamp;    /* pages written at bw_time_stamp */
+       unsigned long write_bandwidth;  /* the estimated write bandwidth */
+       unsigned long avg_write_bandwidth; /* further smoothed write bw */
+
         struct prop_local_percpu completions;
         int dirty_exceeded;
  
diff --git a/include/linux/writeback.h b/include/linux/writeback.h

index df1b7f18f100e6ba4a228d11c9857fe9fc477ddd..66862f2d90c82b72dae5f90d344dd1d2728f16d9 100644 (file)
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -118,6 +118,9 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty);
  unsigned long bdi_dirty_limit(struct backing_dev_info *bdi,
                                unsigned long dirty);
  
+void __bdi_update_bandwidth(struct backing_dev_info *bdi,
+                           unsigned long start_time);
+
  void page_writeback_init(void);
  void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
                                         unsigned long nr_pages_dirtied);
diff --git a/mm/backing-dev.c b/mm/backing-dev.c

index 83f18a1d9d101da595dd8656461feb9103c7c2af..a76cdd160277bc733e09f3a63bae40dbc2c7bb1e 100644 (file)
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -638,6 +638,11 @@ static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
         setup_timer(&wb->wakeup_timer, wakeup_timer_fn, (unsigned long)bdi);
  }
  
+/*
+ * Initial write bandwidth: 100 MB/s
+ *
+ * Expressed in pages per second: 100 << 20 bytes, scaled down by
+ * PAGE_SHIFT. bdi_init() seeds both write_bandwidth and
+ * avg_write_bandwidth with this value.
+ */
+#define INIT_BW                (100 << (20 - PAGE_SHIFT))
+
  int bdi_init(struct backing_dev_info *bdi)
  {
         int i, err;
@@ -660,6 +665,13 @@ int bdi_init(struct backing_dev_info *bdi)
         }
  
         bdi->dirty_exceeded = 0;
+
+       bdi->bw_time_stamp = jiffies;
+       bdi->written_stamp = 0;
+
+       bdi->write_bandwidth = INIT_BW;
+       bdi->avg_write_bandwidth = INIT_BW;
+
         err = prop_local_init_percpu(&bdi->completions);
  
         if (err) {
diff --git a/mm/page-writeback.c b/mm/page-writeback.c

index 8cd71376c63dc0c95869645e36423a6a08c66678..446bdf7b975b113220f1c9b1666fe4b1cf2296ee 100644 (file)
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -36,6 +36,11 @@
  #include <linux/pagevec.h>
  #include <trace/events/writeback.h>
  
+/*
+ * Estimate write bandwidth at 200ms intervals.
+ *
+ * The value is in jiffies (HZ/5); max() keeps it at one jiffy or more
+ * when HZ is smaller than 5.
+ */
+#define BANDWIDTH_INTERVAL     max(HZ/5, 1)
+
  /*
   * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited
   * will look to see if it needs to force writeback or throttling.
@@ -471,6 +476,85 @@ unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty)
         return bdi_dirty;
  }
  
+static void bdi_update_write_bandwidth(struct backing_dev_info *bdi,
+                                      unsigned long elapsed,
+                                      unsigned long written)
+{
+       const unsigned long period = roundup_pow_of_two(3 * HZ);
+       unsigned long avg = bdi->avg_write_bandwidth;
+       unsigned long old = bdi->write_bandwidth;
+       u64 bw;
+
+       /*
+        * Fold the pages written over the last @elapsed jiffies into
+        * bdi->write_bandwidth and let bdi->avg_write_bandwidth follow it.
+        * @written is the current written-pages count; the amount written
+        * in this interval is written - bdi->written_stamp.
+        *
+        * bw = written * HZ / elapsed
+        *
+        *                   bw * elapsed + write_bandwidth * (period - elapsed)
+        * write_bandwidth = ---------------------------------------------------
+        *                                          period
+        *
+        * bw * elapsed is just (written - written_stamp) * HZ, so the
+        * common path needs no division. period is 3 * HZ rounded up to a
+        * power of two, so dividing by it is a right shift by ilog2(period).
+        *
+        * If elapsed is larger than period, the old estimate gets no
+        * weight at all: both write_bandwidth and avg_write_bandwidth are
+        * set to the plain rate over the elapsed time.
+        */
+       bw = written - bdi->written_stamp;
+       bw *= HZ;
+       if (unlikely(elapsed > period)) {
+               do_div(bw, elapsed);
+               avg = bw;
+               goto out;
+       }
+       bw += (u64)bdi->write_bandwidth * (period - elapsed);
+       bw >>= ilog2(period);
+
+       /*
+        * one more level of smoothing, for filtering out sudden spikes
+        *
+        * avg moves 1/8 of the way towards the previous write_bandwidth
+        * (old), but only while old lies between avg and the new estimate
+        * bw, i.e. the estimate keeps heading in the same direction.
+        * Otherwise avg is left unchanged.
+        */
+       if (avg > old && old >= (unsigned long)bw)
+               avg -= (avg - old) >> 3;
+
+       if (avg < old && old <= (unsigned long)bw)
+               avg += (old - avg) >> 3;
+
+out:
+       bdi->write_bandwidth = bw;
+       bdi->avg_write_bandwidth = avg;
+}
+
+void __bdi_update_bandwidth(struct backing_dev_info *bdi,
+                           unsigned long start_time)
+{
+       unsigned long now = jiffies;
+       unsigned long elapsed = now - bdi->bw_time_stamp;
+       unsigned long written;
+
+       /*
+        * Update the write bandwidth estimate of @bdi and take a new
+        * (bw_time_stamp, written_stamp) snapshot. elapsed is the number
+        * of jiffies since the previous snapshot. The callers in this
+        * patch invoke it with the wb list_lock held.
+        *
+        * rate-limit, only update once every 200ms.
+        */
+       if (elapsed < BANDWIDTH_INTERVAL)
+               return;
+
+       written = percpu_counter_read(&bdi->bdi_stat[BDI_WRITTEN]);
+
+       /*
+        * Skip quiet periods when disk bandwidth is under-utilized.
+        * (at least 1s idle time between two flusher runs)
+        *
+        * That is: more than HZ jiffies have passed and the previous
+        * snapshot was taken before @start_time. The estimate is then
+        * left untouched and only the snapshot below is refreshed.
+        */
+       if (elapsed > HZ && time_before(bdi->bw_time_stamp, start_time))
+               goto snapshot;
+
+       bdi_update_write_bandwidth(bdi, elapsed, written);
+
+snapshot:
+       bdi->written_stamp = written;
+       bdi->bw_time_stamp = now;
+}
+
+static void bdi_update_bandwidth(struct backing_dev_info *bdi,
+                                unsigned long start_time)
+{      /* Wrapper that runs __bdi_update_bandwidth() under bdi->wb.list_lock. */
+       if (time_is_after_eq_jiffies(bdi->bw_time_stamp + BANDWIDTH_INTERVAL))
+               return;         /* interval pre-check, done before taking the lock */
+       spin_lock(&bdi->wb.list_lock);  /* held across the whole update */
+       __bdi_update_bandwidth(bdi, start_time);
+       spin_unlock(&bdi->wb.list_lock);
+}
+
  /*
   * balance_dirty_pages() must be called by processes which are generating dirty
   * data.  It looks at the number of dirty pages in the machine and will force
@@ -490,6 +574,7 @@ static void balance_dirty_pages(struct address_space *mapping,
         unsigned long pause = 1;
         bool dirty_exceeded = false;
         struct backing_dev_info *bdi = mapping->backing_dev_info;
+       unsigned long start_time = jiffies;
  
         for (;;) {
                 nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
@@ -544,6 +629,8 @@ static void balance_dirty_pages(struct address_space *mapping,
                 if (!bdi->dirty_exceeded)
                         bdi->dirty_exceeded = 1;
  
+               bdi_update_bandwidth(bdi, start_time);
+
                 /* Note: nr_reclaimable denotes nr_dirty + nr_unstable.
                  * Unstable writes are a feature of certain networked
                  * filesystems (i.e. NFS) in which data may have been
author	Wu Fengguang <fengguang.wu@intel.com>
	Sun, 29 Aug 2010 17:22:30 +0000 (11:22 -0600)
committer	Wu Fengguang <fengguang.wu@intel.com>
	Sun, 10 Jul 2011 05:09:01 +0000 (22:09 -0700)
fs/fs-writeback.c		patch \| blob \| blame \| history
include/linux/backing-dev.h		patch \| blob \| blame \| history
include/linux/writeback.h		patch \| blob \| blame \| history
mm/backing-dev.c		patch \| blob \| blame \| history
mm/page-writeback.c		patch \| blob \| blame \| history