fs/buffer.c

   1 /*
   2  *  linux/fs/buffer.c
   3  *
   4  *  Copyright (C) 1991, 1992, 2002  Linus Torvalds
   5  */
   6
   7 /*
   8  * Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95
   9  *
  10  * Removed a lot of unnecessary code and simplified things now that
  11  * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
  12  *
  13  * Speed up hash, lru, and free list operations.  Use gfp() for allocating
  14  * hash table, use SLAB cache for buffer heads. SMP threading.  -DaveM
  15  *
   16  * Added 32k buffer block sizes - these are required on older ARM systems. - RMK
  17  *
  18  * async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de>
  19  */
  20
  21 #include <linux/config.h>
  22 #include <linux/kernel.h>
  23 #include <linux/syscalls.h>
  24 #include <linux/fs.h>
  25 #include <linux/mm.h>
  26 #include <linux/percpu.h>
  27 #include <linux/slab.h>
  28 #include <linux/smp_lock.h>
  29 #include <linux/blkdev.h>
  30 #include <linux/file.h>
  31 #include <linux/quotaops.h>
  32 #include <linux/highmem.h>
  33 #include <linux/module.h>
  34 #include <linux/writeback.h>
  35 #include <linux/hash.h>
  36 #include <linux/suspend.h>
  37 #include <linux/buffer_head.h>
  38 #include <linux/bio.h>
  39 #include <linux/notifier.h>
  40 #include <linux/cpu.h>
  41 #include <linux/bitops.h>
  42 #include <linux/mpage.h>
  43 #include <linux/bit_spinlock.h>
  44
  45 static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
  46 static void invalidate_bh_lrus(void);
  47
  48 #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
  49
  50 inline void
  51 init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
  52 {
  53         bh->b_end_io = handler;
  54         bh->b_private = private;
  55 }
  56
  57 static int sync_buffer(void *word)
  58 {
  59         struct block_device *bd;
  60         struct buffer_head *bh
  61                 = container_of(word, struct buffer_head, b_state);
  62
  63         smp_mb();
  64         bd = bh->b_bdev;
  65         if (bd)
  66                 blk_run_address_space(bd->bd_inode->i_mapping);
  67         io_schedule();
  68         return 0;
  69 }
  70
  71 void fastcall __lock_buffer(struct buffer_head *bh)
  72 {
  73         wait_on_bit_lock(&bh->b_state, BH_Lock, sync_buffer,
  74                                                         TASK_UNINTERRUPTIBLE);
  75 }
  76 EXPORT_SYMBOL(__lock_buffer);
  77
  78 void fastcall unlock_buffer(struct buffer_head *bh)
  79 {
  80         clear_buffer_locked(bh);
  81         smp_mb__after_clear_bit();
  82         wake_up_bit(&bh->b_state, BH_Lock);
  83 }
  84
  85 /*
  86  * Block until a buffer comes unlocked.  This doesn't stop it
  87  * from becoming locked again - you have to lock it yourself
  88  * if you want to preserve its state.
  89  */
  90 void __wait_on_buffer(struct buffer_head * bh)
  91 {
  92         wait_on_bit(&bh->b_state, BH_Lock, sync_buffer, TASK_UNINTERRUPTIBLE);
  93 }
  94
  95 static void
  96 __clear_page_buffers(struct page *page)
  97 {
  98         ClearPagePrivate(page);
  99         set_page_private(page, 0);
 100         page_cache_release(page);
 101 }
 102
 103 static void buffer_io_error(struct buffer_head *bh)
 104 {
 105         char b[BDEVNAME_SIZE];
 106
 107         printk(KERN_ERR "Buffer I/O error on device %s, logical block %Lu\n",
 108                         bdevname(bh->b_bdev, b),
 109                         (unsigned long long)bh->b_blocknr);
 110 }
 111
 112 /*
 113  * Default synchronous end-of-IO handler..  Just mark it up-to-date and
 114  * unlock the buffer. This is what ll_rw_block uses too.
 115  */
 116 void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
 117 {
 118         if (uptodate) {
 119                 set_buffer_uptodate(bh);
 120         } else {
 121                 /* This happens, due to failed READA attempts. */
 122                 clear_buffer_uptodate(bh);
 123         }
 124         unlock_buffer(bh);
 125         put_bh(bh);
 126 }
 127
 128 void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
 129 {
 130         char b[BDEVNAME_SIZE];
 131
 132         if (uptodate) {
 133                 set_buffer_uptodate(bh);
 134         } else {
 135                 if (!buffer_eopnotsupp(bh) && printk_ratelimit()) {
 136                         buffer_io_error(bh);
 137                         printk(KERN_WARNING "lost page write due to "
 138                                         "I/O error on %s\n",
 139                                        bdevname(bh->b_bdev, b));
 140                 }
 141                 set_buffer_write_io_error(bh);
 142                 clear_buffer_uptodate(bh);
 143         }
 144         unlock_buffer(bh);
 145         put_bh(bh);
 146 }
 147
 148 /*
 149  * Write out and wait upon all the dirty data associated with a block
 150  * device via its mapping.  Does not take the superblock lock.
 151  */
 152 int sync_blockdev(struct block_device *bdev)
 153 {
 154         int ret = 0;
 155
 156         if (bdev)
 157                 ret = filemap_write_and_wait(bdev->bd_inode->i_mapping);
 158         return ret;
 159 }
 160 EXPORT_SYMBOL(sync_blockdev);
 161
 162 /*
 163  * Write out and wait upon all dirty data associated with this
 164  * superblock.  Filesystem data as well as the underlying block
 165  * device.  Takes the superblock lock.
 166  */
 167 int fsync_super(struct super_block *sb)
 168 {
 169         sync_inodes_sb(sb, 0);
 170         DQUOT_SYNC(sb);
 171         lock_super(sb);
 172         if (sb->s_dirt && sb->s_op->write_super)
 173                 sb->s_op->write_super(sb);
 174         unlock_super(sb);
 175         if (sb->s_op->sync_fs)
 176                 sb->s_op->sync_fs(sb, 1);
 177         sync_blockdev(sb->s_bdev);
 178         sync_inodes_sb(sb, 1);
 179
 180         return sync_blockdev(sb->s_bdev);
 181 }
 182
 183 /*
 184  * Write out and wait upon all dirty data associated with this
 185  * device.   Filesystem data as well as the underlying block
 186  * device.  Takes the superblock lock.
 187  */
 188 int fsync_bdev(struct block_device *bdev)
 189 {
 190         struct super_block *sb = get_super(bdev);
 191         if (sb) {
 192                 int res = fsync_super(sb);
 193                 drop_super(sb);
 194                 return res;
 195         }
 196         return sync_blockdev(bdev);
 197 }
 198
 199 /**
 200  * freeze_bdev  --  lock a filesystem and force it into a consistent state
 201  * @bdev:       blockdevice to lock
 202  *
 203  * This takes the block device bd_mount_sem to make sure no new mounts
 204  * happen on bdev until thaw_bdev() is called.
 205  * If a superblock is found on this device, we take the s_umount semaphore
 206  * on it to make sure nobody unmounts until the snapshot creation is done.
 207  */
 208 struct super_block *freeze_bdev(struct block_device *bdev)
 209 {
 210         struct super_block *sb;
 211
 212         down(&bdev->bd_mount_sem);
 213         sb = get_super(bdev);
 214         if (sb && !(sb->s_flags & MS_RDONLY)) {
 215                 sb->s_frozen = SB_FREEZE_WRITE;
 216                 smp_wmb();
 217
 218                 sync_inodes_sb(sb, 0);
 219                 DQUOT_SYNC(sb);
 220
 221                 lock_super(sb);
 222                 if (sb->s_dirt && sb->s_op->write_super)
 223                         sb->s_op->write_super(sb);
 224                 unlock_super(sb);
 225
 226                 if (sb->s_op->sync_fs)
 227                         sb->s_op->sync_fs(sb, 1);
 228
 229                 sync_blockdev(sb->s_bdev);
 230                 sync_inodes_sb(sb, 1);
 231
 232                 sb->s_frozen = SB_FREEZE_TRANS;
 233                 smp_wmb();
 234
 235                 sync_blockdev(sb->s_bdev);
 236
 237                 if (sb->s_op->write_super_lockfs)
 238                         sb->s_op->write_super_lockfs(sb);
 239         }
 240
 241         sync_blockdev(bdev);
 242         return sb;      /* thaw_bdev releases s->s_umount and bd_mount_sem */
 243 }
 244 EXPORT_SYMBOL(freeze_bdev);
 245
 246 /**
 247  * thaw_bdev  -- unlock filesystem
 248  * @bdev:       blockdevice to unlock
 249  * @sb:         associated superblock
 250  *
 251  * Unlocks the filesystem and marks it writeable again after freeze_bdev().
 252  */
 253 void thaw_bdev(struct block_device *bdev, struct super_block *sb)
 254 {
 255         if (sb) {
 256                 BUG_ON(sb->s_bdev != bdev);
 257
 258                 if (sb->s_op->unlockfs)
 259                         sb->s_op->unlockfs(sb);
 260                 sb->s_frozen = SB_UNFROZEN;
 261                 smp_wmb();
 262                 wake_up(&sb->s_wait_unfrozen);
 263                 drop_super(sb);
 264         }
 265
 266         up(&bdev->bd_mount_sem);
 267 }
 268 EXPORT_SYMBOL(thaw_bdev);
 269
 270 /*
 271  * sync everything.  Start out by waking pdflush, because that writes back
 272  * all queues in parallel.
 273  */
 274 static void do_sync(unsigned long wait)
 275 {
 276         wakeup_pdflush(0);
 277         sync_inodes(0);         /* All mappings, inodes and their blockdevs */
 278         DQUOT_SYNC(NULL);
 279         sync_supers();          /* Write the superblocks */
 280         sync_filesystems(0);    /* Start syncing the filesystems */
 281         sync_filesystems(wait); /* Waitingly sync the filesystems */
 282         sync_inodes(wait);      /* Mappings, inodes and blockdevs, again. */
 283         if (!wait)
 284                 printk("Emergency Sync complete\n");
 285         if (unlikely(laptop_mode))
 286                 laptop_sync_completion();
 287 }
 288
 289 asmlinkage long sys_sync(void)
 290 {
 291         do_sync(1);
 292         return 0;
 293 }
 294
 295 void emergency_sync(void)
 296 {
 297         pdflush_operation(do_sync, 0);
 298 }
 299
 300 /*
 301  * Generic function to fsync a file.
 302  *
 303  * filp may be NULL if called via the msync of a vma.
 304  */
 305
 306 int file_fsync(struct file *filp, struct dentry *dentry, int datasync)
 307 {
 308         struct inode * inode = dentry->d_inode;
 309         struct super_block * sb;
 310         int ret, err;
 311
 312         /* sync the inode to buffers */
 313         ret = write_inode_now(inode, 0);
 314
 315         /* sync the superblock to buffers */
 316         sb = inode->i_sb;
 317         lock_super(sb);
 318         if (sb->s_op->write_super)
 319                 sb->s_op->write_super(sb);
 320         unlock_super(sb);
 321
 322         /* .. finally sync the buffers to disk */
 323         err = sync_blockdev(sb->s_bdev);
 324         if (!ret)
 325                 ret = err;
 326         return ret;
 327 }
 328
 329 static long do_fsync(unsigned int fd, int datasync)
 330 {
 331         struct file * file;
 332         struct address_space *mapping;
 333         int ret, err;
 334
 335         ret = -EBADF;
 336         file = fget(fd);
 337         if (!file)
 338                 goto out;
 339
 340         ret = -EINVAL;
 341         if (!file->f_op || !file->f_op->fsync) {
 342                 /* Why?  We can still call filemap_fdatawrite */
 343                 goto out_putf;
 344         }
 345
 346         mapping = file->f_mapping;
 347
 348         current->flags |= PF_SYNCWRITE;
 349         ret = filemap_fdatawrite(mapping);
 350
 351         /*
 352          * We need to protect against concurrent writers,
 353          * which could cause livelocks in fsync_buffers_list
 354          */
 355         mutex_lock(&mapping->host->i_mutex);
 356         err = file->f_op->fsync(file, file->f_dentry, datasync);
 357         if (!ret)
 358                 ret = err;
 359         mutex_unlock(&mapping->host->i_mutex);
 360         err = filemap_fdatawait(mapping);
 361         if (!ret)
 362                 ret = err;
 363         current->flags &= ~PF_SYNCWRITE;
 364
 365 out_putf:
 366         fput(file);
 367 out:
 368         return ret;
 369 }
 370
 371 asmlinkage long sys_fsync(unsigned int fd)
 372 {
 373         return do_fsync(fd, 0);
 374 }
 375
 376 asmlinkage long sys_fdatasync(unsigned int fd)
 377 {
 378         return do_fsync(fd, 1);
 379 }
 380
 381 /*
 382  * Various filesystems appear to want __find_get_block to be non-blocking.
 383  * But it's the page lock which protects the buffers.  To get around this,
 384  * we get exclusion from try_to_free_buffers with the blockdev mapping's
 385  * private_lock.
 386  *
 387  * Hack idea: for the blockdev mapping, i_bufferlist_lock contention
 388  * may be quite high.  This code could TryLock the page, and if that
 389  * succeeds, there is no need to take private_lock. (But if
 390  * private_lock is contended then so is mapping->tree_lock).
 391  */
 392 static struct buffer_head *
 393 __find_get_block_slow(struct block_device *bdev, sector_t block)
 394 {
 395         struct inode *bd_inode = bdev->bd_inode;
 396         struct address_space *bd_mapping = bd_inode->i_mapping;
 397         struct buffer_head *ret = NULL;
 398         pgoff_t index;
 399         struct buffer_head *bh;
 400         struct buffer_head *head;
 401         struct page *page;
 402         int all_mapped = 1;
 403
 404         index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits);
 405         page = find_get_page(bd_mapping, index);
 406         if (!page)
 407                 goto out;
 408
 409         spin_lock(&bd_mapping->private_lock);
 410         if (!page_has_buffers(page))
 411                 goto out_unlock;
 412         head = page_buffers(page);
 413         bh = head;
 414         do {
 415                 if (bh->b_blocknr == block) {
 416                         ret = bh;
 417                         get_bh(bh);
 418                         goto out_unlock;
 419                 }
 420                 if (!buffer_mapped(bh))
 421                         all_mapped = 0;
 422                 bh = bh->b_this_page;
 423         } while (bh != head);
 424
 425         /* we might be here because some of the buffers on this page are
 426          * not mapped.  This is due to various races between
 427          * file io on the block device and getblk.  It gets dealt with
 428          * elsewhere, don't buffer_error if we had some unmapped buffers
 429          */
 430         if (all_mapped) {
 431                 printk("__find_get_block_slow() failed. "
 432                         "block=%llu, b_blocknr=%llu\n",
 433                         (unsigned long long)block, (unsigned long long)bh->b_blocknr);
 434                 printk("b_state=0x%08lx, b_size=%u\n", bh->b_state, bh->b_size);
 435                 printk("device blocksize: %d\n", 1 << bd_inode->i_blkbits);
 436         }
 437 out_unlock:
 438         spin_unlock(&bd_mapping->private_lock);
 439         page_cache_release(page);
 440 out:
 441         return ret;
 442 }
 443
 444 /* If invalidate_buffers() will trash dirty buffers, it means some kind
 445    of fs corruption is going on. Trashing dirty data always imply losing
 446    information that was supposed to be just stored on the physical layer
 447    by the user.
 448
  449    Thus invalidate_buffers in general usage is not allowed to trash
  450    dirty buffers. For example ioctl(BLKFLSBUF) expects dirty data to
 451    be preserved.  These buffers are simply skipped.
 452
 453    We also skip buffers which are still in use.  For example this can
 454    happen if a userspace program is reading the block device.
 455
 456    NOTE: In the case where the user removed a removable-media-disk even if
 457    there's still dirty data not synced on disk (due a bug in the device driver
 458    or due an error of the user), by not destroying the dirty buffers we could
 459    generate corruption also on the next media inserted, thus a parameter is
 460    necessary to handle this case in the most safe way possible (trying
 461    to not corrupt also the new disk inserted with the data belonging to
 462    the old now corrupted disk). Also for the ramdisk the natural thing
 463    to do in order to release the ramdisk memory is to destroy dirty buffers.
 464
 465    These are two special cases. Normal usage imply the device driver
 466    to issue a sync on the device (without waiting I/O completion) and
 467    then an invalidate_buffers call that doesn't trash dirty buffers.
 468
 469    For handling cache coherency with the blkdev pagecache the 'update' case
 470    is been introduced. It is needed to re-read from disk any pinned
 471    buffer. NOTE: re-reading from disk is destructive so we can do it only
 472    when we assume nobody is changing the buffercache under our I/O and when
 473    we think the disk contains more recent information than the buffercache.
 474    The update == 1 pass marks the buffers we need to update, the update == 2
 475    pass does the actual I/O. */
 476 void invalidate_bdev(struct block_device *bdev, int destroy_dirty_buffers)
 477 {
 478         invalidate_bh_lrus();
 479         /*
 480          * FIXME: what about destroy_dirty_buffers?
 481          * We really want to use invalidate_inode_pages2() for
 482          * that, but not until that's cleaned up.
 483          */
 484         invalidate_inode_pages(bdev->bd_inode->i_mapping);
 485 }
 486
 487 /*
 488  * Kick pdflush then try to free up some ZONE_NORMAL memory.
 489  */
 490 static void free_more_memory(void)
 491 {
 492         struct zone **zones;
 493         pg_data_t *pgdat;
 494
 495         wakeup_pdflush(1024);
 496         yield();
 497
 498         for_each_pgdat(pgdat) {
 499                 zones = pgdat->node_zonelists[gfp_zone(GFP_NOFS)].zones;
 500                 if (*zones)
 501                         try_to_free_pages(zones, GFP_NOFS);
 502         }
 503 }
 504
 505 /*
 506  * I/O completion handler for block_read_full_page() - pages
 507  * which come unlocked at the end of I/O.
 508  */
 509 static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
 510 {
 511         unsigned long flags;
 512         struct buffer_head *first;
 513         struct buffer_head *tmp;
 514         struct page *page;
 515         int page_uptodate = 1;
 516
 517         BUG_ON(!buffer_async_read(bh));
 518
 519         page = bh->b_page;
 520         if (uptodate) {
 521                 set_buffer_uptodate(bh);
 522         } else {
 523                 clear_buffer_uptodate(bh);
 524                 if (printk_ratelimit())
 525                         buffer_io_error(bh);
 526                 SetPageError(page);
 527         }
 528
 529         /*
 530          * Be _very_ careful from here on. Bad things can happen if
 531          * two buffer heads end IO at almost the same time and both
 532          * decide that the page is now completely done.
 533          */
 534         first = page_buffers(page);
 535         local_irq_save(flags);
 536         bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
 537         clear_buffer_async_read(bh);
 538         unlock_buffer(bh);
 539         tmp = bh;
 540         do {
 541                 if (!buffer_uptodate(tmp))
 542                         page_uptodate = 0;
 543                 if (buffer_async_read(tmp)) {
 544                         BUG_ON(!buffer_locked(tmp));
 545                         goto still_busy;
 546                 }
 547                 tmp = tmp->b_this_page;
 548         } while (tmp != bh);
 549         bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
 550         local_irq_restore(flags);
 551
 552         /*
 553          * If none of the buffers had errors and they are all
 554          * uptodate then we can set the page uptodate.
 555          */
 556         if (page_uptodate && !PageError(page))
 557                 SetPageUptodate(page);
 558         unlock_page(page);
 559         return;
 560
 561 still_busy:
 562         bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
 563         local_irq_restore(flags);
 564         return;
 565 }
 566
 567 /*
 568  * Completion handler for block_write_full_page() - pages which are unlocked
 569  * during I/O, and which have PageWriteback cleared upon I/O completion.
 570  */
 571 void end_buffer_async_write(struct buffer_head *bh, int uptodate)
 572 {
 573         char b[BDEVNAME_SIZE];
 574         unsigned long flags;
 575         struct buffer_head *first;
 576         struct buffer_head *tmp;
 577         struct page *page;
 578
 579         BUG_ON(!buffer_async_write(bh));
 580
 581         page = bh->b_page;
 582         if (uptodate) {
 583                 set_buffer_uptodate(bh);
 584         } else {
 585                 if (printk_ratelimit()) {
 586                         buffer_io_error(bh);
 587                         printk(KERN_WARNING "lost page write due to "
 588                                         "I/O error on %s\n",
 589                                bdevname(bh->b_bdev, b));
 590                 }
 591                 set_bit(AS_EIO, &page->mapping->flags);
 592                 clear_buffer_uptodate(bh);
 593                 SetPageError(page);
 594         }
 595
 596         first = page_buffers(page);
 597         local_irq_save(flags);
 598         bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
 599
 600         clear_buffer_async_write(bh);
 601         unlock_buffer(bh);
 602         tmp = bh->b_this_page;
 603         while (tmp != bh) {
 604                 if (buffer_async_write(tmp)) {
 605                         BUG_ON(!buffer_locked(tmp));
 606                         goto still_busy;
 607                 }
 608                 tmp = tmp->b_this_page;
 609         }
 610         bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
 611         local_irq_restore(flags);
 612         end_page_writeback(page);
 613         return;
 614
 615 still_busy:
 616         bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
 617         local_irq_restore(flags);
 618         return;
 619 }
 620
 621 /*
 622  * If a page's buffers are under async readin (end_buffer_async_read
 623  * completion) then there is a possibility that another thread of
 624  * control could lock one of the buffers after it has completed
 625  * but while some of the other buffers have not completed.  This
 626  * locked buffer would confuse end_buffer_async_read() into not unlocking
 627  * the page.  So the absence of BH_Async_Read tells end_buffer_async_read()
 628  * that this buffer is not under async I/O.
 629  *
 630  * The page comes unlocked when it has no locked buffer_async buffers
 631  * left.
 632  *
  633  * PageLocked prevents anyone starting new async I/O reads against any of
  634  * the buffers.
 635  *
 636  * PageWriteback is used to prevent simultaneous writeout of the same
 637  * page.
 638  *
 639  * PageLocked prevents anyone from starting writeback of a page which is
 640  * under read I/O (PageWriteback is only ever set against a locked page).
 641  */
 642 static void mark_buffer_async_read(struct buffer_head *bh)
 643 {
 644         bh->b_end_io = end_buffer_async_read;
 645         set_buffer_async_read(bh);
 646 }
 647
 648 void mark_buffer_async_write(struct buffer_head *bh)
 649 {
 650         bh->b_end_io = end_buffer_async_write;
 651         set_buffer_async_write(bh);
 652 }
 653 EXPORT_SYMBOL(mark_buffer_async_write);
 654
 655
 656 /*
 657  * fs/buffer.c contains helper functions for buffer-backed address space's
 658  * fsync functions.  A common requirement for buffer-based filesystems is
 659  * that certain data from the backing blockdev needs to be written out for
 660  * a successful fsync().  For example, ext2 indirect blocks need to be
 661  * written back and waited upon before fsync() returns.
 662  *
  663  * The functions mark_buffer_dirty_inode(), fsync_inode_buffers(),
 664  * inode_has_buffers() and invalidate_inode_buffers() are provided for the
 665  * management of a list of dependent buffers at ->i_mapping->private_list.
 666  *
 667  * Locking is a little subtle: try_to_free_buffers() will remove buffers
 668  * from their controlling inode's queue when they are being freed.  But
 669  * try_to_free_buffers() will be operating against the *blockdev* mapping
 670  * at the time, not against the S_ISREG file which depends on those buffers.
 671  * So the locking for private_list is via the private_lock in the address_space
 672  * which backs the buffers.  Which is different from the address_space
 673  * against which the buffers are listed.  So for a particular address_space,
 674  * mapping->private_lock does *not* protect mapping->private_list!  In fact,
 675  * mapping->private_list will always be protected by the backing blockdev's
 676  * ->private_lock.
 677  *
 678  * Which introduces a requirement: all buffers on an address_space's
 679  * ->private_list must be from the same address_space: the blockdev's.
 680  *
 681  * address_spaces which do not place buffers at ->private_list via these
 682  * utility functions are free to use private_lock and private_list for
 683  * whatever they want.  The only requirement is that list_empty(private_list)
 684  * be true at clear_inode() time.
 685  *
 686  * FIXME: clear_inode should not call invalidate_inode_buffers().  The
 687  * filesystems should do that.  invalidate_inode_buffers() should just go
 688  * BUG_ON(!list_empty).
 689  *
 690  * FIXME: mark_buffer_dirty_inode() is a data-plane operation.  It should
 691  * take an address_space, not an inode.  And it should be called
 692  * mark_buffer_dirty_fsync() to clearly define why those buffers are being
 693  * queued up.
 694  *
 695  * FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the
 696  * list if it is already on a list.  Because if the buffer is on a list,
 697  * it *must* already be on the right one.  If not, the filesystem is being
 698  * silly.  This will save a ton of locking.  But first we have to ensure
 699  * that buffers are taken *off* the old inode's list when they are freed
 700  * (presumably in truncate).  That requires careful auditing of all
 701  * filesystems (do it inside bforget()).  It could also be done by bringing
 702  * b_inode back.
 703  */
 704
 705 /*
 706  * The buffer's backing address_space's private_lock must be held
 707  */
 708 static inline void __remove_assoc_queue(struct buffer_head *bh)
 709 {
 710         list_del_init(&bh->b_assoc_buffers);
 711 }
 712
 713 int inode_has_buffers(struct inode *inode)
 714 {
 715         return !list_empty(&inode->i_data.private_list);
 716 }
 717
 718 /*
 719  * osync is designed to support O_SYNC io.  It waits synchronously for
 720  * all already-submitted IO to complete, but does not queue any new
 721  * writes to the disk.
 722  *
 723  * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
 724  * you dirty the buffers, and then use osync_inode_buffers to wait for
 725  * completion.  Any other dirty buffers which are not yet queued for
 726  * write will not be flushed to disk by the osync.
 727  */
 728 static int osync_buffers_list(spinlock_t *lock, struct list_head *list)
 729 {
 730         struct buffer_head *bh;
 731         struct list_head *p;
 732         int err = 0;
 733
 734         spin_lock(lock);
 735 repeat:
 736         list_for_each_prev(p, list) {
 737                 bh = BH_ENTRY(p);
 738                 if (buffer_locked(bh)) {
 739                         get_bh(bh);
 740                         spin_unlock(lock);
 741                         wait_on_buffer(bh);
 742                         if (!buffer_uptodate(bh))
 743                                 err = -EIO;
 744                         brelse(bh);
 745                         spin_lock(lock);
 746                         goto repeat;
 747                 }
 748         }
 749         spin_unlock(lock);
 750         return err;
 751 }
 752
 753 /**
 754  * sync_mapping_buffers - write out and wait upon a mapping's "associated"
 755  *                        buffers
 756  * @mapping: the mapping which wants those buffers written
 757  *
 758  * Starts I/O against the buffers at mapping->private_list, and waits upon
 759  * that I/O.
 760  *
 761  * Basically, this is a convenience function for fsync().
 762  * @mapping is a file or directory which needs those buffers to be written for
 763  * a successful fsync().
 764  */
 765 int sync_mapping_buffers(struct address_space *mapping)
 766 {
 767         struct address_space *buffer_mapping = mapping->assoc_mapping;
 768
 769         if (buffer_mapping == NULL || list_empty(&mapping->private_list))
 770                 return 0;
 771
 772         return fsync_buffers_list(&buffer_mapping->private_lock,
 773                                         &mapping->private_list);
 774 }
 775 EXPORT_SYMBOL(sync_mapping_buffers);
 776
 777 /*
 778  * Called when we've recently written block `bblock', and it is known that
 779  * `bblock' was for a buffer_boundary() buffer.  This means that the block at
 780  * `bblock + 1' is probably a dirty indirect block.  Hunt it down and, if it's
 781  * dirty, schedule it for IO.  So that indirects merge nicely with their data.
 782  */
 783 void write_boundary_block(struct block_device *bdev,
 784                         sector_t bblock, unsigned blocksize)
 785 {
 786         struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize);
 787         if (bh) {
 788                 if (buffer_dirty(bh))
 789                         ll_rw_block(WRITE, 1, &bh);
 790                 put_bh(bh);
 791         }
 792 }
 793
 794 void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
 795 {
 796         struct address_space *mapping = inode->i_mapping;
 797         struct address_space *buffer_mapping = bh->b_page->mapping;
 798
 799         mark_buffer_dirty(bh);
 800         if (!mapping->assoc_mapping) {
 801                 mapping->assoc_mapping = buffer_mapping;
 802         } else {
 803                 if (mapping->assoc_mapping != buffer_mapping)
 804                         BUG();
 805         }
 806         if (list_empty(&bh->b_assoc_buffers)) {
 807                 spin_lock(&buffer_mapping->private_lock);
 808                 list_move_tail(&bh->b_assoc_buffers,
 809                                 &mapping->private_list);
 810                 spin_unlock(&buffer_mapping->private_lock);
 811         }
 812 }
 813 EXPORT_SYMBOL(mark_buffer_dirty_inode);
 814
 815 /*
 816  * Add a page to the dirty page list.
 817  *
 818  * It is a sad fact of life that this function is called from several places
 819  * deeply under spinlocking.  It may not sleep.
 820  *
 821  * If the page has buffers, the uptodate buffers are set dirty, to preserve
  822  * dirty-state coherency between the page and the buffers.  If the page does
 823  * not have buffers then when they are later attached they will all be set
 824  * dirty.
 825  *
 826  * The buffers are dirtied before the page is dirtied.  There's a small race
 827  * window in which a writepage caller may see the page cleanness but not the
 828  * buffer dirtiness.  That's fine.  If this code were to set the page dirty
 829  * before the buffers, a concurrent writepage caller could clear the page dirty
 830  * bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean
 831  * page on the dirty page list.
 832  *
 833  * We use private_lock to lock against try_to_free_buffers while using the
 834  * page's buffer list.  Also use this to protect against clean buffers being
 835  * added to the page after it was set dirty.
 836  *
 837  * FIXME: may need to call ->reservepage here as well.  That's rather up to the
 838  * address_space though.
 839  */
 840 int __set_page_dirty_buffers(struct page *page)
 841 {
 842         struct address_space * const mapping = page->mapping;
 843
 844         spin_lock(&mapping->private_lock);
 845         if (page_has_buffers(page)) {
 846                 struct buffer_head *head = page_buffers(page);
 847                 struct buffer_head *bh = head;
 848
 849                 do {
 850                         set_buffer_dirty(bh);
 851                         bh = bh->b_this_page;
 852                 } while (bh != head);
 853         }
 854         spin_unlock(&mapping->private_lock);
 855
 856         if (!TestSetPageDirty(page)) {
 857                 write_lock_irq(&mapping->tree_lock);
 858                 if (page->mapping) {    /* Race with truncate? */
 859                         if (mapping_cap_account_dirty(mapping))
 860                                 inc_page_state(nr_dirty);
 861                         radix_tree_tag_set(&mapping->page_tree,
 862                                                 page_index(page),
 863                                                 PAGECACHE_TAG_DIRTY);
 864                 }
 865                 write_unlock_irq(&mapping->tree_lock);
 866                 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
 867         }
 868
 869         return 0;
 870 }
 871 EXPORT_SYMBOL(__set_page_dirty_buffers);
 872
 873 /*
 874  * Write out and wait upon a list of buffers.
 875  *
 876  * We have conflicting pressures: we want to make sure that all
 877  * initially dirty buffers get waited on, but that any subsequently
 878  * dirtied buffers don't.  After all, we don't want fsync to last
 879  * forever if somebody is actively writing to the file.
 880  *
 881  * Do this in two main stages: first we copy dirty buffers to a
 882  * temporary inode list, queueing the writes as we go.  Then we clean
 883  * up, waiting for those writes to complete.
 884  *
 885  * During this second stage, any subsequent updates to the file may end
 886  * up refiling the buffer on the original inode's dirty list again, so
 887  * there is a chance we will end up with a buffer queued for write but
 888  * not yet completed on that list.  So, as a final cleanup we go through
 889  * the osync code to catch these locked, dirty buffers without requeuing
 890  * any newly dirty buffers for write.
 891  */
 892 static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
 893 {
 894         struct buffer_head *bh;
 895         struct list_head tmp;
 896         int err = 0, err2;
 897
 898         INIT_LIST_HEAD(&tmp);
 899
 900         spin_lock(lock);
 901         while (!list_empty(list)) {
 902                 bh = BH_ENTRY(list->next);
 903                 list_del_init(&bh->b_assoc_buffers);
 904                 if (buffer_dirty(bh) || buffer_locked(bh)) {
 905                         list_add(&bh->b_assoc_buffers, &tmp);
 906                         if (buffer_dirty(bh)) {
 907                                 get_bh(bh);
 908                                 spin_unlock(lock);
 909                                 /*
 910                                  * Ensure any pending I/O completes so that
 911                                  * ll_rw_block() actually writes the current
 912                                  * contents - it is a noop if I/O is still in
 913                                  * flight on potentially older contents.
 914                                  */
 915                                 ll_rw_block(SWRITE, 1, &bh);
 916                                 brelse(bh);
 917                                 spin_lock(lock);
 918                         }
 919                 }
 920         }
 921
 922         while (!list_empty(&tmp)) {
 923                 bh = BH_ENTRY(tmp.prev);
 924                 __remove_assoc_queue(bh);
 925                 get_bh(bh);
 926                 spin_unlock(lock);
 927                 wait_on_buffer(bh);
 928                 if (!buffer_uptodate(bh))
 929                         err = -EIO;
 930                 brelse(bh);
 931                 spin_lock(lock);
 932         }
 933
 934         spin_unlock(lock);
 935         err2 = osync_buffers_list(lock, list);
 936         if (err)
 937                 return err;
 938         else
 939                 return err2;
 940 }
 941
 942 /*
 943  * Invalidate any and all dirty buffers on a given inode.  We are
 944  * probably unmounting the fs, but that doesn't mean we have already
 945  * done a sync().  Just drop the buffers from the inode list.
 946  *
 947  * NOTE: we take the inode's blockdev's mapping's private_lock.  Which
 948  * assumes that all the buffers are against the blockdev.  Not true
 949  * for reiserfs.
 950  */
 951 void invalidate_inode_buffers(struct inode *inode)
 952 {
 953         if (inode_has_buffers(inode)) {
 954                 struct address_space *mapping = &inode->i_data;
 955                 struct list_head *list = &mapping->private_list;
 956                 struct address_space *buffer_mapping = mapping->assoc_mapping;
 957
 958                 spin_lock(&buffer_mapping->private_lock);
 959                 while (!list_empty(list))
 960                         __remove_assoc_queue(BH_ENTRY(list->next));
 961                 spin_unlock(&buffer_mapping->private_lock);
 962         }
 963 }
 964
 965 /*
 966  * Remove any clean buffers from the inode's buffer list.  This is called
 967  * when we're trying to free the inode itself.  Those buffers can pin it.
 968  *
 969  * Returns true if all buffers were removed.
 970  */
 971 int remove_inode_buffers(struct inode *inode)
 972 {
 973         int ret = 1;
 974
 975         if (inode_has_buffers(inode)) {
 976                 struct address_space *mapping = &inode->i_data;
 977                 struct list_head *list = &mapping->private_list;
 978                 struct address_space *buffer_mapping = mapping->assoc_mapping;
 979
 980                 spin_lock(&buffer_mapping->private_lock);
 981                 while (!list_empty(list)) {
 982                         struct buffer_head *bh = BH_ENTRY(list->next);
 983                         if (buffer_dirty(bh)) {
 984                                 ret = 0;
 985                                 break;
 986                         }
 987                         __remove_assoc_queue(bh);
 988                 }
 989                 spin_unlock(&buffer_mapping->private_lock);
 990         }
 991         return ret;
 992 }
 993
 994 /*
 995  * Create the appropriate buffers when given a page for data area and
 996  * the size of each buffer.. Use the bh->b_this_page linked list to
 997  * follow the buffers created.  Return NULL if unable to create more
 998  * buffers.
 999  *
1000  * The retry flag is used to differentiate async IO (paging, swapping)
1001  * which may not fail from ordinary buffer allocations.
1002  */
1003 struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
1004                 int retry)
1005 {
1006         struct buffer_head *bh, *head;
1007         long offset;
1008
1009 try_again:
1010         head = NULL;
1011         offset = PAGE_SIZE;
1012         while ((offset -= size) >= 0) {
1013                 bh = alloc_buffer_head(GFP_NOFS);
1014                 if (!bh)
1015                         goto no_grow;
1016
1017                 bh->b_bdev = NULL;
1018                 bh->b_this_page = head;
1019                 bh->b_blocknr = -1;
1020                 head = bh;
1021
1022                 bh->b_state = 0;
1023                 atomic_set(&bh->b_count, 0);
1024                 bh->b_size = size;
1025
1026                 /* Link the buffer to its page */
1027                 set_bh_page(bh, page, offset);
1028
1029                 bh->b_end_io = NULL;
1030         }
1031         return head;
1032 /*
1033  * In case anything failed, we just free everything we got.
1034  */
1035 no_grow:
1036         if (head) {
1037                 do {
1038                         bh = head;
1039                         head = head->b_this_page;
1040                         free_buffer_head(bh);
1041                 } while (head);
1042         }
1043
1044         /*
1045          * Return failure for non-async IO requests.  Async IO requests
1046          * are not allowed to fail, so we have to wait until buffer heads
1047          * become available.  But we don't want tasks sleeping with
1048          * partially complete buffers, so all were released above.
1049          */
1050         if (!retry)
1051                 return NULL;
1052
1053         /* We're _really_ low on memory. Now we just
1054          * wait for old buffer heads to become free due to
1055          * finishing IO.  Since this is an async request and
1056          * the reserve list is empty, we're sure there are
1057          * async buffer heads in use.
1058          */
1059         free_more_memory();
1060         goto try_again;
1061 }
1062 EXPORT_SYMBOL_GPL(alloc_page_buffers);
1063
1064 static inline void
1065 link_dev_buffers(struct page *page, struct buffer_head *head)
1066 {
1067         struct buffer_head *bh, *tail;
1068
1069         bh = head;
1070         do {
1071                 tail = bh;
1072                 bh = bh->b_this_page;
1073         } while (bh);
1074         tail->b_this_page = head;
1075         attach_page_buffers(page, head);
1076 }
1077
1078 /*
1079  * Initialise the state of a blockdev page's buffers.
1080  */
1081 static void
1082 init_page_buffers(struct page *page, struct block_device *bdev,
1083                         sector_t block, int size)
1084 {
1085         struct buffer_head *head = page_buffers(page);
1086         struct buffer_head *bh = head;
1087         int uptodate = PageUptodate(page);
1088
1089         do {
1090                 if (!buffer_mapped(bh)) {
1091                         init_buffer(bh, NULL, NULL);
1092                         bh->b_bdev = bdev;
1093                         bh->b_blocknr = block;
1094                         if (uptodate)
1095                                 set_buffer_uptodate(bh);
1096                         set_buffer_mapped(bh);
1097                 }
1098                 block++;
1099                 bh = bh->b_this_page;
1100         } while (bh != head);
1101 }
1102
1103 /*
1104  * Create the page-cache page that contains the requested block.
1105  *
1106  * This is user purely for blockdev mappings.
1107  */
1108 static struct page *
1109 grow_dev_page(struct block_device *bdev, sector_t block,
1110                 pgoff_t index, int size)
1111 {
1112         struct inode *inode = bdev->bd_inode;
1113         struct page *page;
1114         struct buffer_head *bh;
1115
1116         page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
1117         if (!page)
1118                 return NULL;
1119
1120         if (!PageLocked(page))
1121                 BUG();
1122
1123         if (page_has_buffers(page)) {
1124                 bh = page_buffers(page);
1125                 if (bh->b_size == size) {
1126                         init_page_buffers(page, bdev, block, size);
1127                         return page;
1128                 }
1129                 if (!try_to_free_buffers(page))
1130                         goto failed;
1131         }
1132
1133         /*
1134          * Allocate some buffers for this page
1135          */
1136         bh = alloc_page_buffers(page, size, 0);
1137         if (!bh)
1138                 goto failed;
1139
1140         /*
1141          * Link the page to the buffers and initialise them.  Take the
1142          * lock to be atomic wrt __find_get_block(), which does not
1143          * run under the page lock.
1144          */
1145         spin_lock(&inode->i_mapping->private_lock);
1146         link_dev_buffers(page, bh);
1147         init_page_buffers(page, bdev, block, size);
1148         spin_unlock(&inode->i_mapping->private_lock);
1149         return page;
1150
1151 failed:
1152         BUG();
1153         unlock_page(page);
1154         page_cache_release(page);
1155         return NULL;
1156 }
1157
1158 /*
1159  * Create buffers for the specified block device block's page.  If
1160  * that page was dirty, the buffers are set dirty also.
1161  *
1162  * Except that's a bug.  Attaching dirty buffers to a dirty
1163  * blockdev's page can result in filesystem corruption, because
1164  * some of those buffers may be aliases of filesystem data.
1165  * grow_dev_page() will go BUG() if this happens.
1166  */
1167 static inline int
1168 grow_buffers(struct block_device *bdev, sector_t block, int size)
1169 {
1170         struct page *page;
1171         pgoff_t index;
1172         int sizebits;
1173
1174         sizebits = -1;
1175         do {
1176                 sizebits++;
1177         } while ((size << sizebits) < PAGE_SIZE);
1178
1179         index = block >> sizebits;
1180         block = index << sizebits;
1181
1182         /* Create a page with the proper size buffers.. */
1183         page = grow_dev_page(bdev, block, index, size);
1184         if (!page)
1185                 return 0;
1186         unlock_page(page);
1187         page_cache_release(page);
1188         return 1;
1189 }
1190
1191 static struct buffer_head *
1192 __getblk_slow(struct block_device *bdev, sector_t block, int size)
1193 {
1194         /* Size must be multiple of hard sectorsize */
1195         if (unlikely(size & (bdev_hardsect_size(bdev)-1) ||
1196                         (size < 512 || size > PAGE_SIZE))) {
1197                 printk(KERN_ERR "getblk(): invalid block size %d requested\n",
1198                                         size);
1199                 printk(KERN_ERR "hardsect size: %d\n",
1200                                         bdev_hardsect_size(bdev));
1201
1202                 dump_stack();
1203                 return NULL;
1204         }
1205
1206         for (;;) {
1207                 struct buffer_head * bh;
1208
1209                 bh = __find_get_block(bdev, block, size);
1210                 if (bh)
1211                         return bh;
1212
1213                 if (!grow_buffers(bdev, block, size))
1214                         free_more_memory();
1215         }
1216 }
1217
1218 /*
1219  * The relationship between dirty buffers and dirty pages:
1220  *
1221  * Whenever a page has any dirty buffers, the page's dirty bit is set, and
1222  * the page is tagged dirty in its radix tree.
1223  *
1224  * At all times, the dirtiness of the buffers represents the dirtiness of
1225  * subsections of the page.  If the page has buffers, the page dirty bit is
1226  * merely a hint about the true dirty state.
1227  *
1228  * When a page is set dirty in its entirety, all its buffers are marked dirty
1229  * (if the page has buffers).
1230  *
1231  * When a buffer is marked dirty, its page is dirtied, but the page's other
1232  * buffers are not.
1233  *
1234  * Also.  When blockdev buffers are explicitly read with bread(), they
1235  * individually become uptodate.  But their backing page remains not
1236  * uptodate - even if all of its buffers are uptodate.  A subsequent
1237  * block_read_full_page() against that page will discover all the uptodate
1238  * buffers, will set the page uptodate and will perform no I/O.
1239  */
1240
1241 /**
1242  * mark_buffer_dirty - mark a buffer_head as needing writeout
1243  * @bh: the buffer_head to mark dirty
1244  *
1245  * mark_buffer_dirty() will set the dirty bit against the buffer, then set its
1246  * backing page dirty, then tag the page as dirty in its address_space's radix
1247  * tree and then attach the address_space's inode to its superblock's dirty
1248  * inode list.
1249  *
1250  * mark_buffer_dirty() is atomic.  It takes bh->b_page->mapping->private_lock,
1251  * mapping->tree_lock and the global inode_lock.
1252  */
1253 void fastcall mark_buffer_dirty(struct buffer_head *bh)
1254 {
1255         if (!buffer_dirty(bh) && !test_set_buffer_dirty(bh))
1256                 __set_page_dirty_nobuffers(bh->b_page);
1257 }
1258
1259 /*
1260  * Decrement a buffer_head's reference count.  If all buffers against a page
1261  * have zero reference count, are clean and unlocked, and if the page is clean
1262  * and unlocked then try_to_free_buffers() may strip the buffers from the page
1263  * in preparation for freeing it (sometimes, rarely, buffers are removed from
1264  * a page but it ends up not being freed, and buffers may later be reattached).
1265  */
1266 void __brelse(struct buffer_head * buf)
1267 {
1268         if (atomic_read(&buf->b_count)) {
1269                 put_bh(buf);
1270                 return;
1271         }
1272         printk(KERN_ERR "VFS: brelse: Trying to free free buffer\n");
1273         WARN_ON(1);
1274 }
1275
1276 /*
1277  * bforget() is like brelse(), except it discards any
1278  * potentially dirty data.
1279  */
1280 void __bforget(struct buffer_head *bh)
1281 {
1282         clear_buffer_dirty(bh);
1283         if (!list_empty(&bh->b_assoc_buffers)) {
1284                 struct address_space *buffer_mapping = bh->b_page->mapping;
1285
1286                 spin_lock(&buffer_mapping->private_lock);
1287                 list_del_init(&bh->b_assoc_buffers);
1288                 spin_unlock(&buffer_mapping->private_lock);
1289         }
1290         __brelse(bh);
1291 }
1292
1293 static struct buffer_head *__bread_slow(struct buffer_head *bh)
1294 {
1295         lock_buffer(bh);
1296         if (buffer_uptodate(bh)) {
1297                 unlock_buffer(bh);
1298                 return bh;
1299         } else {
1300                 get_bh(bh);
1301                 bh->b_end_io = end_buffer_read_sync;
1302                 submit_bh(READ, bh);
1303                 wait_on_buffer(bh);
1304                 if (buffer_uptodate(bh))
1305                         return bh;
1306         }
1307         brelse(bh);
1308         return NULL;
1309 }
1310
1311 /*
1312  * Per-cpu buffer LRU implementation.  To reduce the cost of __find_get_block().
1313  * The bhs[] array is sorted - newest buffer is at bhs[0].  Buffers have their
1314  * refcount elevated by one when they're in an LRU.  A buffer can only appear
1315  * once in a particular CPU's LRU.  A single buffer can be present in multiple
1316  * CPU's LRUs at the same time.
1317  *
1318  * This is a transparent caching front-end to sb_bread(), sb_getblk() and
1319  * sb_find_get_block().
1320  *
1321  * The LRUs themselves only need locking against invalidate_bh_lrus.  We use
1322  * a local interrupt disable for that.
1323  */
1324
1325 #define BH_LRU_SIZE     8
1326
1327 struct bh_lru {
1328         struct buffer_head *bhs[BH_LRU_SIZE];
1329 };
1330
1331 static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }};
1332
1333 #ifdef CONFIG_SMP
1334 #define bh_lru_lock()   local_irq_disable()
1335 #define bh_lru_unlock() local_irq_enable()
1336 #else
1337 #define bh_lru_lock()   preempt_disable()
1338 #define bh_lru_unlock() preempt_enable()
1339 #endif
1340
/*
 * Sanity check used before touching the per-cpu LRU: BUG if interrupts are
 * disabled.  Compiles to nothing when irqs_disabled is not defined as a
 * macro.
 */
static inline void check_irqs_on(void)
{
#if defined(irqs_disabled)
	BUG_ON(irqs_disabled());
#endif
}
1347
1348 /*
1349  * The LRU management algorithm is dopey-but-simple.  Sorry.
1350  */
1351 static void bh_lru_install(struct buffer_head *bh)
1352 {
1353         struct buffer_head *evictee = NULL;
1354         struct bh_lru *lru;
1355
1356         check_irqs_on();
1357         bh_lru_lock();
1358         lru = &__get_cpu_var(bh_lrus);
1359         if (lru->bhs[0] != bh) {
1360                 struct buffer_head *bhs[BH_LRU_SIZE];
1361                 int in;
1362                 int out = 0;
1363
1364                 get_bh(bh);
1365                 bhs[out++] = bh;
1366                 for (in = 0; in < BH_LRU_SIZE; in++) {
1367                         struct buffer_head *bh2 = lru->bhs[in];
1368
1369                         if (bh2 == bh) {
1370                                 __brelse(bh2);
1371                         } else {
1372                                 if (out >= BH_LRU_SIZE) {
1373                                         BUG_ON(evictee != NULL);
1374                                         evictee = bh2;
1375                                 } else {
1376                                         bhs[out++] = bh2;
1377                                 }
1378                         }
1379                 }
1380                 while (out < BH_LRU_SIZE)
1381                         bhs[out++] = NULL;
1382                 memcpy(lru->bhs, bhs, sizeof(bhs));
1383         }
1384         bh_lru_unlock();
1385
1386         if (evictee)
1387                 __brelse(evictee);
1388 }
1389
1390 /*
1391  * Look up the bh in this cpu's LRU.  If it's there, move it to the head.
1392  */
1393 static inline struct buffer_head *
1394 lookup_bh_lru(struct block_device *bdev, sector_t block, int size)
1395 {
1396         struct buffer_head *ret = NULL;
1397         struct bh_lru *lru;
1398         int i;
1399
1400         check_irqs_on();
1401         bh_lru_lock();
1402         lru = &__get_cpu_var(bh_lrus);
1403         for (i = 0; i < BH_LRU_SIZE; i++) {
1404                 struct buffer_head *bh = lru->bhs[i];
1405
1406                 if (bh && bh->b_bdev == bdev &&
1407                                 bh->b_blocknr == block && bh->b_size == size) {
1408                         if (i) {
1409                                 while (i) {
1410                                         lru->bhs[i] = lru->bhs[i - 1];
1411                                         i--;
1412                                 }
1413                                 lru->bhs[0] = bh;
1414                         }
1415                         get_bh(bh);
1416                         ret = bh;
1417                         break;
1418                 }
1419         }
1420         bh_lru_unlock();
1421         return ret;
1422 }
1423
1424 /*
1425  * Perform a pagecache lookup for the matching buffer.  If it's there, refresh
1426  * it in the LRU and mark it as accessed.  If it is not present then return
1427  * NULL
1428  */
1429 struct buffer_head *
1430 __find_get_block(struct block_device *bdev, sector_t block, int size)
1431 {
1432         struct buffer_head *bh = lookup_bh_lru(bdev, block, size);
1433
1434         if (bh == NULL) {
1435                 bh = __find_get_block_slow(bdev, block);
1436                 if (bh)
1437                         bh_lru_install(bh);
1438         }
1439         if (bh)
1440                 touch_buffer(bh);
1441         return bh;
1442 }
1443 EXPORT_SYMBOL(__find_get_block);
1444
1445 /*
1446  * __getblk will locate (and, if necessary, create) the buffer_head
1447  * which corresponds to the passed block_device, block and size. The
1448  * returned buffer has its reference count incremented.
1449  *
1450  * __getblk() cannot fail - it just keeps trying.  If you pass it an
1451  * illegal block number, __getblk() will happily return a buffer_head
1452  * which represents the non-existent block.  Very weird.
1453  *
1454  * __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers()
1455  * attempt is failing.  FIXME, perhaps?
1456  */
1457 struct buffer_head *
1458 __getblk(struct block_device *bdev, sector_t block, int size)
1459 {
1460         struct buffer_head *bh = __find_get_block(bdev, block, size);
1461
1462         might_sleep();
1463         if (bh == NULL)
1464                 bh = __getblk_slow(bdev, block, size);
1465         return bh;
1466 }
1467 EXPORT_SYMBOL(__getblk);
1468
1469 /*
1470  * Do async read-ahead on a buffer..
1471  */
1472 void __breadahead(struct block_device *bdev, sector_t block, int size)
1473 {
1474         struct buffer_head *bh = __getblk(bdev, block, size);
1475         if (likely(bh)) {
1476                 ll_rw_block(READA, 1, &bh);
1477                 brelse(bh);
1478         }
1479 }
1480 EXPORT_SYMBOL(__breadahead);
1481
1482 /**
1483  *  __bread() - reads a specified block and returns the bh
1484  *  @bdev: the block_device to read from
1485  *  @block: number of block
1486  *  @size: size (in bytes) to read
1487  *
1488  *  Reads a specified block, and returns buffer head that contains it.
1489  *  It returns NULL if the block was unreadable.
1490  */
1491 struct buffer_head *
1492 __bread(struct block_device *bdev, sector_t block, int size)
1493 {
1494         struct buffer_head *bh = __getblk(bdev, block, size);
1495
1496         if (likely(bh) && !buffer_uptodate(bh))
1497                 bh = __bread_slow(bh);
1498         return bh;
1499 }
1500 EXPORT_SYMBOL(__bread);
1501
1502 /*
1503  * invalidate_bh_lrus() is called rarely - but not only at unmount.
1504  * This doesn't race because it runs in each cpu either in irq
1505  * or with preempt disabled.
1506  */
1507 static void invalidate_bh_lru(void *arg)
1508 {
1509         struct bh_lru *b = &get_cpu_var(bh_lrus);
1510         int i;
1511
1512         for (i = 0; i < BH_LRU_SIZE; i++) {
1513                 brelse(b->bhs[i]);
1514                 b->bhs[i] = NULL;
1515         }
1516         put_cpu_var(bh_lrus);
1517 }
1518
1519 static void invalidate_bh_lrus(void)
1520 {
1521         on_each_cpu(invalidate_bh_lru, NULL, 1, 1);
1522 }
1523
1524 void set_bh_page(struct buffer_head *bh,
1525                 struct page *page, unsigned long offset)
1526 {
1527         bh->b_page = page;
1528         if (offset >= PAGE_SIZE)
1529                 BUG();
1530         if (PageHighMem(page))
1531                 /*
1532                  * This catches illegal uses and preserves the offset:
1533                  */
1534                 bh->b_data = (char *)(0 + offset);
1535         else
1536                 bh->b_data = page_address(page) + offset;
1537 }
1538 EXPORT_SYMBOL(set_bh_page);
1539
1540 /*
1541  * Called when truncating a buffer on a page completely.
1542  */
1543 static inline void discard_buffer(struct buffer_head * bh)
1544 {
1545         lock_buffer(bh);
1546         clear_buffer_dirty(bh);
1547         bh->b_bdev = NULL;
1548         clear_buffer_mapped(bh);
1549         clear_buffer_req(bh);
1550         clear_buffer_new(bh);
1551         clear_buffer_delay(bh);
1552         unlock_buffer(bh);
1553 }
1554
1555 /**
1556  * try_to_release_page() - release old fs-specific metadata on a page
1557  *
1558  * @page: the page which the kernel is trying to free
1559  * @gfp_mask: memory allocation flags (and I/O mode)
1560  *
1561  * The address_space is to try to release any data against the page
1562  * (presumably at page->private).  If the release was successful, return `1'.
1563  * Otherwise return zero.
1564  *
1565  * The @gfp_mask argument specifies whether I/O may be performed to release
1566  * this page (__GFP_IO), and whether the call may block (__GFP_WAIT).
1567  *
1568  * NOTE: @gfp_mask may go away, and this function may become non-blocking.
1569  */
1570 int try_to_release_page(struct page *page, gfp_t gfp_mask)
1571 {
1572         struct address_space * const mapping = page->mapping;
1573
1574         BUG_ON(!PageLocked(page));
1575         if (PageWriteback(page))
1576                 return 0;
1577
1578         if (mapping && mapping->a_ops->releasepage)
1579                 return mapping->a_ops->releasepage(page, gfp_mask);
1580         return try_to_free_buffers(page);
1581 }
1582 EXPORT_SYMBOL(try_to_release_page);
1583
1584 /**
1585  * block_invalidatepage - invalidate part of all of a buffer-backed page
1586  *
1587  * @page: the page which is affected
1588  * @offset: the index of the truncation point
1589  *
1590  * block_invalidatepage() is called when all or part of the page has become
1591  * invalidatedby a truncate operation.
1592  *
1593  * block_invalidatepage() does not have to release all buffers, but it must
1594  * ensure that no dirty buffer is left outside @offset and that no I/O
1595  * is underway against any of the blocks which are outside the truncation
1596  * point.  Because the caller is about to free (and possibly reuse) those
1597  * blocks on-disk.
1598  */
1599 int block_invalidatepage(struct page *page, unsigned long offset)
1600 {
1601         struct buffer_head *head, *bh, *next;
1602         unsigned int curr_off = 0;
1603         int ret = 1;
1604
1605         BUG_ON(!PageLocked(page));
1606         if (!page_has_buffers(page))
1607                 goto out;
1608
1609         head = page_buffers(page);
1610         bh = head;
1611         do {
1612                 unsigned int next_off = curr_off + bh->b_size;
1613                 next = bh->b_this_page;
1614
1615                 /*
1616                  * is this block fully invalidated?
1617                  */
1618                 if (offset <= curr_off)
1619                         discard_buffer(bh);
1620                 curr_off = next_off;
1621                 bh = next;
1622         } while (bh != head);
1623
1624         /*
1625          * We release buffers only if the entire page is being invalidated.
1626          * The get_block cached value has been unconditionally invalidated,
1627          * so real IO is not possible anymore.
1628          */
1629         if (offset == 0)
1630                 ret = try_to_release_page(page, 0);
1631 out:
1632         return ret;
1633 }
1634 EXPORT_SYMBOL(block_invalidatepage);
1635
1636 int do_invalidatepage(struct page *page, unsigned long offset)
1637 {
1638         int (*invalidatepage)(struct page *, unsigned long);
1639         invalidatepage = page->mapping->a_ops->invalidatepage;
1640         if (invalidatepage == NULL)
1641                 invalidatepage = block_invalidatepage;
1642         return (*invalidatepage)(page, offset);
1643 }
1644
1645 /*
1646  * We attach and possibly dirty the buffers atomically wrt
1647  * __set_page_dirty_buffers() via private_lock.  try_to_free_buffers
1648  * is already excluded via the page lock.
1649  */
1650 void create_empty_buffers(struct page *page,
1651                         unsigned long blocksize, unsigned long b_state)
1652 {
1653         struct buffer_head *bh, *head, *tail;
1654
1655         head = alloc_page_buffers(page, blocksize, 1);
1656         bh = head;
1657         do {
1658                 bh->b_state |= b_state;
1659                 tail = bh;
1660                 bh = bh->b_this_page;
1661         } while (bh);
1662         tail->b_this_page = head;
1663
1664         spin_lock(&page->mapping->private_lock);
1665         if (PageUptodate(page) || PageDirty(page)) {
1666                 bh = head;
1667                 do {
1668                         if (PageDirty(page))
1669                                 set_buffer_dirty(bh);
1670                         if (PageUptodate(page))
1671                                 set_buffer_uptodate(bh);
1672                         bh = bh->b_this_page;
1673                 } while (bh != head);
1674         }
1675         attach_page_buffers(page, head);
1676         spin_unlock(&page->mapping->private_lock);
1677 }
1678 EXPORT_SYMBOL(create_empty_buffers);
1679
1680 /*
1681  * We are taking a block for data and we don't want any output from any
1682  * buffer-cache aliases starting from return from that function and
1683  * until the moment when something will explicitly mark the buffer
1684  * dirty (hopefully that will not happen until we will free that block ;-)
1685  * We don't even need to mark it not-uptodate - nobody can expect
1686  * anything from a newly allocated buffer anyway. We used to used
1687  * unmap_buffer() for such invalidation, but that was wrong. We definitely
1688  * don't want to mark the alias unmapped, for example - it would confuse
1689  * anyone who might pick it with bread() afterwards...
1690  *
1691  * Also..  Note that bforget() doesn't lock the buffer.  So there can
1692  * be writeout I/O going on against recently-freed buffers.  We don't
1693  * wait on that I/O in bforget() - it's more efficient to wait on the I/O
1694  * only if we really need to.  That happens here.
1695  */
1696 void unmap_underlying_metadata(struct block_device *bdev, sector_t block)
1697 {
1698         struct buffer_head *old_bh;
1699
1700         might_sleep();
1701
1702         old_bh = __find_get_block_slow(bdev, block);
1703         if (old_bh) {
1704                 clear_buffer_dirty(old_bh);
1705                 wait_on_buffer(old_bh);
1706                 clear_buffer_req(old_bh);
1707                 __brelse(old_bh);
1708         }
1709 }
1710 EXPORT_SYMBOL(unmap_underlying_metadata);
1711
1712 /*
1713  * NOTE! All mapped/uptodate combinations are valid:
1714  *
1715  *      Mapped  Uptodate        Meaning
1716  *
1717  *      No      No              "unknown" - must do get_block()
1718  *      No      Yes             "hole" - zero-filled
1719  *      Yes     No              "allocated" - allocated on disk, not read in
1720  *      Yes     Yes             "valid" - allocated and up-to-date in memory.
1721  *
1722  * "Dirty" is valid only with the last case (mapped+uptodate).
1723  */
1724
1725 /*
1726  * While block_write_full_page is writing back the dirty buffers under
1727  * the page lock, whoever dirtied the buffers may decide to clean them
1728  * again at any time.  We handle that by only looking at the buffer
1729  * state inside lock_buffer().
1730  *
1731  * If block_write_full_page() is called for regular writeback
1732  * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a
1733  * locked buffer.   This only can happen if someone has written the buffer
1734  * directly, with submit_bh().  At the address_space level PageWriteback
1735  * prevents this contention from occurring.
1736  */
1737 static int __block_write_full_page(struct inode *inode, struct page *page,
1738                         get_block_t *get_block, struct writeback_control *wbc)
1739 {
1740         int err;
1741         sector_t block;
1742         sector_t last_block;
1743         struct buffer_head *bh, *head;
1744         int nr_underway = 0;
1745
1746         BUG_ON(!PageLocked(page));
1747
1748         last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;
1749
1750         if (!page_has_buffers(page)) {
1751                 create_empty_buffers(page, 1 << inode->i_blkbits,
1752                                         (1 << BH_Dirty)|(1 << BH_Uptodate));
1753         }
1754
1755         /*
1756          * Be very careful.  We have no exclusion from __set_page_dirty_buffers
1757          * here, and the (potentially unmapped) buffers may become dirty at
1758          * any time.  If a buffer becomes dirty here after we've inspected it
1759          * then we just miss that fact, and the page stays dirty.
1760          *
1761          * Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
1762          * handle that here by just cleaning them.
1763          */
1764
1765         block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1766         head = page_buffers(page);
1767         bh = head;
1768
1769         /*
1770          * Get all the dirty buffers mapped to disk addresses and
1771          * handle any aliases from the underlying blockdev's mapping.
1772          */
1773         do {
1774                 if (block > last_block) {
1775                         /*
1776                          * mapped buffers outside i_size will occur, because
1777                          * this page can be outside i_size when there is a
1778                          * truncate in progress.
1779                          */
1780                         /*
1781                          * The buffer was zeroed by block_write_full_page()
1782                          */
1783                         clear_buffer_dirty(bh);
1784                         set_buffer_uptodate(bh);
1785                 } else if (!buffer_mapped(bh) && buffer_dirty(bh)) {
1786                         err = get_block(inode, block, bh, 1);
1787                         if (err)
1788                                 goto recover;
1789                         if (buffer_new(bh)) {
1790                                 /* blockdev mappings never come here */
1791                                 clear_buffer_new(bh);
1792                                 unmap_underlying_metadata(bh->b_bdev,
1793                                                         bh->b_blocknr);
1794                         }
1795                 }
1796                 bh = bh->b_this_page;
1797                 block++;
1798         } while (bh != head);
1799
1800         do {
1801                 if (!buffer_mapped(bh))
1802                         continue;
1803                 /*
1804                  * If it's a fully non-blocking write attempt and we cannot
1805                  * lock the buffer then redirty the page.  Note that this can
1806                  * potentially cause a busy-wait loop from pdflush and kswapd
1807                  * activity, but those code paths have their own higher-level
1808                  * throttling.
1809                  */
1810                 if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {
1811                         lock_buffer(bh);
1812                 } else if (test_set_buffer_locked(bh)) {
1813                         redirty_page_for_writepage(wbc, page);
1814                         continue;
1815                 }
1816                 if (test_clear_buffer_dirty(bh)) {
1817                         mark_buffer_async_write(bh);
1818                 } else {
1819                         unlock_buffer(bh);
1820                 }
1821         } while ((bh = bh->b_this_page) != head);
1822
1823         /*
1824          * The page and its buffers are protected by PageWriteback(), so we can
1825          * drop the bh refcounts early.
1826          */
1827         BUG_ON(PageWriteback(page));
1828         set_page_writeback(page);
1829
1830         do {
1831                 struct buffer_head *next = bh->b_this_page;
1832                 if (buffer_async_write(bh)) {
1833                         submit_bh(WRITE, bh);
1834                         nr_underway++;
1835                 }
1836                 bh = next;
1837         } while (bh != head);
1838         unlock_page(page);
1839
1840         err = 0;
1841 done:
1842         if (nr_underway == 0) {
1843                 /*
1844                  * The page was marked dirty, but the buffers were
1845                  * clean.  Someone wrote them back by hand with
1846                  * ll_rw_block/submit_bh.  A rare case.
1847                  */
1848                 int uptodate = 1;
1849                 do {
1850                         if (!buffer_uptodate(bh)) {
1851                                 uptodate = 0;
1852                                 break;
1853                         }
1854                         bh = bh->b_this_page;
1855                 } while (bh != head);
1856                 if (uptodate)
1857                         SetPageUptodate(page);
1858                 end_page_writeback(page);
1859                 /*
1860                  * The page and buffer_heads can be released at any time from
1861                  * here on.
1862                  */
1863                 wbc->pages_skipped++;   /* We didn't write this page */
1864         }
1865         return err;
1866
1867 recover:
1868         /*
1869          * ENOSPC, or some other error.  We may already have added some
1870          * blocks to the file, so we need to write these out to avoid
1871          * exposing stale data.
1872          * The page is currently locked and not marked for writeback
1873          */
1874         bh = head;
1875         /* Recovery: lock and submit the mapped buffers */
1876         do {
1877                 if (buffer_mapped(bh) && buffer_dirty(bh)) {
1878                         lock_buffer(bh);
1879                         mark_buffer_async_write(bh);
1880                 } else {
1881                         /*
1882                          * The buffer may have been set dirty during
1883                          * attachment to a dirty page.
1884                          */
1885                         clear_buffer_dirty(bh);
1886                 }
1887         } while ((bh = bh->b_this_page) != head);
1888         SetPageError(page);
1889         BUG_ON(PageWriteback(page));
1890         set_page_writeback(page);
1891         unlock_page(page);
1892         do {
1893                 struct buffer_head *next = bh->b_this_page;
1894                 if (buffer_async_write(bh)) {
1895                         clear_buffer_dirty(bh);
1896                         submit_bh(WRITE, bh);
1897                         nr_underway++;
1898                 }
1899                 bh = next;
1900         } while (bh != head);
1901         goto done;
1902 }
1903
1904 static int __block_prepare_write(struct inode *inode, struct page *page,
1905                 unsigned from, unsigned to, get_block_t *get_block)
1906 {
1907         unsigned block_start, block_end;
1908         sector_t block;
1909         int err = 0;
1910         unsigned blocksize, bbits;
1911         struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
1912
1913         BUG_ON(!PageLocked(page));
1914         BUG_ON(from > PAGE_CACHE_SIZE);
1915         BUG_ON(to > PAGE_CACHE_SIZE);
1916         BUG_ON(from > to);
1917
1918         blocksize = 1 << inode->i_blkbits;
1919         if (!page_has_buffers(page))
1920                 create_empty_buffers(page, blocksize, 0);
1921         head = page_buffers(page);
1922
1923         bbits = inode->i_blkbits;
1924         block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
1925
1926         for(bh = head, block_start = 0; bh != head || !block_start;
1927             block++, block_start=block_end, bh = bh->b_this_page) {
1928                 block_end = block_start + blocksize;
1929                 if (block_end <= from || block_start >= to) {
1930                         if (PageUptodate(page)) {
1931                                 if (!buffer_uptodate(bh))
1932                                         set_buffer_uptodate(bh);
1933                         }
1934                         continue;
1935                 }
1936                 if (buffer_new(bh))
1937                         clear_buffer_new(bh);
1938                 if (!buffer_mapped(bh)) {
1939                         err = get_block(inode, block, bh, 1);
1940                         if (err)
1941                                 break;
1942                         if (buffer_new(bh)) {
1943                                 unmap_underlying_metadata(bh->b_bdev,
1944                                                         bh->b_blocknr);
1945                                 if (PageUptodate(page)) {
1946                                         set_buffer_uptodate(bh);
1947                                         continue;
1948                                 }
1949                                 if (block_end > to || block_start < from) {
1950                                         void *kaddr;
1951
1952                                         kaddr = kmap_atomic(page, KM_USER0);
1953                                         if (block_end > to)
1954                                                 memset(kaddr+to, 0,
1955                                                         block_end-to);
1956                                         if (block_start < from)
1957                                                 memset(kaddr+block_start,
1958                                                         0, from-block_start);
1959                                         flush_dcache_page(page);
1960                                         kunmap_atomic(kaddr, KM_USER0);
1961                                 }
1962                                 continue;
1963                         }
1964                 }
1965                 if (PageUptodate(page)) {
1966                         if (!buffer_uptodate(bh))
1967                                 set_buffer_uptodate(bh);
1968                         continue;
1969                 }
1970                 if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
1971                      (block_start < from || block_end > to)) {
1972                         ll_rw_block(READ, 1, &bh);
1973                         *wait_bh++=bh;
1974                 }
1975         }
1976         /*
1977          * If we issued read requests - let them complete.
1978          */
1979         while(wait_bh > wait) {
1980                 wait_on_buffer(*--wait_bh);
1981                 if (!buffer_uptodate(*wait_bh))
1982                         err = -EIO;
1983         }
1984         if (!err) {
1985                 bh = head;
1986                 do {
1987                         if (buffer_new(bh))
1988                                 clear_buffer_new(bh);
1989                 } while ((bh = bh->b_this_page) != head);
1990                 return 0;
1991         }
1992         /* Error case: */
1993         /*
1994          * Zero out any newly allocated blocks to avoid exposing stale
1995          * data.  If BH_New is set, we know that the block was newly
1996          * allocated in the above loop.
1997          */
1998         bh = head;
1999         block_start = 0;
2000         do {
2001                 block_end = block_start+blocksize;
2002                 if (block_end <= from)
2003                         goto next_bh;
2004                 if (block_start >= to)
2005                         break;
2006                 if (buffer_new(bh)) {
2007                         void *kaddr;
2008
2009                         clear_buffer_new(bh);
2010                         kaddr = kmap_atomic(page, KM_USER0);
2011                         memset(kaddr+block_start, 0, bh->b_size);
2012                         kunmap_atomic(kaddr, KM_USER0);
2013                         set_buffer_uptodate(bh);
2014                         mark_buffer_dirty(bh);
2015                 }
2016 next_bh:
2017                 block_start = block_end;
2018                 bh = bh->b_this_page;
2019         } while (bh != head);
2020         return err;
2021 }
2022
2023 static int __block_commit_write(struct inode *inode, struct page *page,
2024                 unsigned from, unsigned to)
2025 {
2026         unsigned block_start, block_end;
2027         int partial = 0;
2028         unsigned blocksize;
2029         struct buffer_head *bh, *head;
2030
2031         blocksize = 1 << inode->i_blkbits;
2032
2033         for(bh = head = page_buffers(page), block_start = 0;
2034             bh != head || !block_start;
2035             block_start=block_end, bh = bh->b_this_page) {
2036                 block_end = block_start + blocksize;
2037                 if (block_end <= from || block_start >= to) {
2038                         if (!buffer_uptodate(bh))
2039                                 partial = 1;
2040                 } else {
2041                         set_buffer_uptodate(bh);
2042                         mark_buffer_dirty(bh);
2043                 }
2044         }
2045
2046         /*
2047          * If this is a partial write which happened to make all buffers
2048          * uptodate then we can optimize away a bogus readpage() for
2049          * the next read(). Here we 'discover' whether the page went
2050          * uptodate as a result of this (potentially partial) write.
2051          */
2052         if (!partial)
2053                 SetPageUptodate(page);
2054         return 0;
2055 }
2056
2057 /*
2058  * Generic "read page" function for block devices that have the normal
2059  * get_block functionality. This is most of the block device filesystems.
2060  * Reads the page asynchronously --- the unlock_buffer() and
2061  * set/clear_buffer_uptodate() functions propagate buffer state into the
2062  * page struct once IO has completed.
2063  */
2064 int block_read_full_page(struct page *page, get_block_t *get_block)
2065 {
2066         struct inode *inode = page->mapping->host;
2067         sector_t iblock, lblock;
2068         struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
2069         unsigned int blocksize;
2070         int nr, i;
2071         int fully_mapped = 1;
2072
2073         BUG_ON(!PageLocked(page));
2074         blocksize = 1 << inode->i_blkbits;
2075         if (!page_has_buffers(page))
2076                 create_empty_buffers(page, blocksize, 0);
2077         head = page_buffers(page);
2078
2079         iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2080         lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits;
2081         bh = head;
2082         nr = 0;
2083         i = 0;
2084
2085         do {
2086                 if (buffer_uptodate(bh))
2087                         continue;
2088
2089                 if (!buffer_mapped(bh)) {
2090                         int err = 0;
2091
2092                         fully_mapped = 0;
2093                         if (iblock < lblock) {
2094                                 err = get_block(inode, iblock, bh, 0);
2095                                 if (err)
2096                                         SetPageError(page);
2097                         }
2098                         if (!buffer_mapped(bh)) {
2099                                 void *kaddr = kmap_atomic(page, KM_USER0);
2100                                 memset(kaddr + i * blocksize, 0, blocksize);
2101                                 flush_dcache_page(page);
2102                                 kunmap_atomic(kaddr, KM_USER0);
2103                                 if (!err)
2104                                         set_buffer_uptodate(bh);
2105                                 continue;
2106                         }
2107                         /*
2108                          * get_block() might have updated the buffer
2109                          * synchronously
2110                          */
2111                         if (buffer_uptodate(bh))
2112                                 continue;
2113                 }
2114                 arr[nr++] = bh;
2115         } while (i++, iblock++, (bh = bh->b_this_page) != head);
2116
2117         if (fully_mapped)
2118                 SetPageMappedToDisk(page);
2119
2120         if (!nr) {
2121                 /*
2122                  * All buffers are uptodate - we can set the page uptodate
2123                  * as well. But not if get_block() returned an error.
2124                  */
2125                 if (!PageError(page))
2126                         SetPageUptodate(page);
2127                 unlock_page(page);
2128                 return 0;
2129         }
2130
2131         /* Stage two: lock the buffers */
2132         for (i = 0; i < nr; i++) {
2133                 bh = arr[i];
2134                 lock_buffer(bh);
2135                 mark_buffer_async_read(bh);
2136         }
2137
2138         /*
2139          * Stage 3: start the IO.  Check for uptodateness
2140          * inside the buffer lock in case another process reading
2141          * the underlying blockdev brought it uptodate (the sct fix).
2142          */
2143         for (i = 0; i < nr; i++) {
2144                 bh = arr[i];
2145                 if (buffer_uptodate(bh))
2146                         end_buffer_async_read(bh, 1);
2147                 else
2148                         submit_bh(READ, bh);
2149         }
2150         return 0;
2151 }
2152
2153 /* utility function for filesystems that need to do work on expanding
2154  * truncates.  Uses prepare/commit_write to allow the filesystem to
2155  * deal with the hole.
2156  */
2157 static int __generic_cont_expand(struct inode *inode, loff_t size,
2158                                  pgoff_t index, unsigned int offset)
2159 {
2160         struct address_space *mapping = inode->i_mapping;
2161         struct page *page;
2162         unsigned long limit;
2163         int err;
2164
2165         err = -EFBIG;
2166         limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
2167         if (limit != RLIM_INFINITY && size > (loff_t)limit) {
2168                 send_sig(SIGXFSZ, current, 0);
2169                 goto out;
2170         }
2171         if (size > inode->i_sb->s_maxbytes)
2172                 goto out;
2173
2174         err = -ENOMEM;
2175         page = grab_cache_page(mapping, index);
2176         if (!page)
2177                 goto out;
2178         err = mapping->a_ops->prepare_write(NULL, page, offset, offset);
2179         if (err) {
2180                 /*
2181                  * ->prepare_write() may have instantiated a few blocks
2182                  * outside i_size.  Trim these off again.
2183                  */
2184                 unlock_page(page);
2185                 page_cache_release(page);
2186                 vmtruncate(inode, inode->i_size);
2187                 goto out;
2188         }
2189
2190         err = mapping->a_ops->commit_write(NULL, page, offset, offset);
2191
2192         unlock_page(page);
2193         page_cache_release(page);
2194         if (err > 0)
2195                 err = 0;
2196 out:
2197         return err;
2198 }
2199
2200 int generic_cont_expand(struct inode *inode, loff_t size)
2201 {
2202         pgoff_t index;
2203         unsigned int offset;
2204
2205         offset = (size & (PAGE_CACHE_SIZE - 1)); /* Within page */
2206
2207         /* ugh.  in prepare/commit_write, if from==to==start of block, we
2208         ** skip the prepare.  make sure we never send an offset for the start
2209         ** of a block
2210         */
2211         if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) {
2212                 /* caller must handle this extra byte. */
2213                 offset++;
2214         }
2215         index = size >> PAGE_CACHE_SHIFT;
2216
2217         return __generic_cont_expand(inode, size, index, offset);
2218 }
2219
2220 int generic_cont_expand_simple(struct inode *inode, loff_t size)
2221 {
2222         loff_t pos = size - 1;
2223         pgoff_t index = pos >> PAGE_CACHE_SHIFT;
2224         unsigned int offset = (pos & (PAGE_CACHE_SIZE - 1)) + 1;
2225
2226         /* prepare/commit_write can handle even if from==to==start of block. */
2227         return __generic_cont_expand(inode, size, index, offset);
2228 }
2229
2230 /*
2231  * For moronic filesystems that do not allow holes in file.
2232  * We may have to extend the file.
2233  */
2234
2235 int cont_prepare_write(struct page *page, unsigned offset,
2236                 unsigned to, get_block_t *get_block, loff_t *bytes)
2237 {
2238         struct address_space *mapping = page->mapping;
2239         struct inode *inode = mapping->host;
2240         struct page *new_page;
2241         pgoff_t pgpos;
2242         long status;
2243         unsigned zerofrom;
2244         unsigned blocksize = 1 << inode->i_blkbits;
2245         void *kaddr;
2246
2247         while(page->index > (pgpos = *bytes>>PAGE_CACHE_SHIFT)) {
2248                 status = -ENOMEM;
2249                 new_page = grab_cache_page(mapping, pgpos);
2250                 if (!new_page)
2251                         goto out;
2252                 /* we might sleep */
2253                 if (*bytes>>PAGE_CACHE_SHIFT != pgpos) {
2254                         unlock_page(new_page);
2255                         page_cache_release(new_page);
2256                         continue;
2257                 }
2258                 zerofrom = *bytes & ~PAGE_CACHE_MASK;
2259                 if (zerofrom & (blocksize-1)) {
2260                         *bytes |= (blocksize-1);
2261                         (*bytes)++;
2262                 }
2263                 status = __block_prepare_write(inode, new_page, zerofrom,
2264                                                 PAGE_CACHE_SIZE, get_block);
2265                 if (status)
2266                         goto out_unmap;
2267                 kaddr = kmap_atomic(new_page, KM_USER0);
2268                 memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom);
2269                 flush_dcache_page(new_page);
2270                 kunmap_atomic(kaddr, KM_USER0);
2271                 generic_commit_write(NULL, new_page, zerofrom, PAGE_CACHE_SIZE);
2272                 unlock_page(new_page);
2273                 page_cache_release(new_page);
2274         }
2275
2276         if (page->index < pgpos) {
2277                 /* completely inside the area */
2278                 zerofrom = offset;
2279         } else {
2280                 /* page covers the boundary, find the boundary offset */
2281                 zerofrom = *bytes & ~PAGE_CACHE_MASK;
2282
2283                 /* if we will expand the thing last block will be filled */
2284                 if (to > zerofrom && (zerofrom & (blocksize-1))) {
2285                         *bytes |= (blocksize-1);
2286                         (*bytes)++;
2287                 }
2288
2289                 /* starting below the boundary? Nothing to zero out */
2290                 if (offset <= zerofrom)
2291                         zerofrom = offset;
2292         }
2293         status = __block_prepare_write(inode, page, zerofrom, to, get_block);
2294         if (status)
2295                 goto out1;
2296         if (zerofrom < offset) {
2297                 kaddr = kmap_atomic(page, KM_USER0);
2298                 memset(kaddr+zerofrom, 0, offset-zerofrom);
2299                 flush_dcache_page(page);
2300                 kunmap_atomic(kaddr, KM_USER0);
2301                 __block_commit_write(inode, page, zerofrom, offset);
2302         }
2303         return 0;
2304 out1:
2305         ClearPageUptodate(page);
2306         return status;
2307
2308 out_unmap:
2309         ClearPageUptodate(new_page);
2310         unlock_page(new_page);
2311         page_cache_release(new_page);
2312 out:
2313         return status;
2314 }
2315
2316 int block_prepare_write(struct page *page, unsigned from, unsigned to,
2317                         get_block_t *get_block)
2318 {
2319         struct inode *inode = page->mapping->host;
2320         int err = __block_prepare_write(inode, page, from, to, get_block);
2321         if (err)
2322                 ClearPageUptodate(page);
2323         return err;
2324 }
2325
2326 int block_commit_write(struct page *page, unsigned from, unsigned to)
2327 {
2328         struct inode *inode = page->mapping->host;
2329         __block_commit_write(inode,page,from,to);
2330         return 0;
2331 }
2332
2333 int generic_commit_write(struct file *file, struct page *page,
2334                 unsigned from, unsigned to)
2335 {
2336         struct inode *inode = page->mapping->host;
2337         loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
2338         __block_commit_write(inode,page,from,to);
2339         /*
2340          * No need to use i_size_read() here, the i_size
2341          * cannot change under us because we hold i_mutex.
2342          */
2343         if (pos > inode->i_size) {
2344                 i_size_write(inode, pos);
2345                 mark_inode_dirty(inode);
2346         }
2347         return 0;
2348 }
2349
2350
/*
 * I/O completion handler for nobh_prepare_write()'s prereads.
 *
 * Those prereads are special: their buffer_heads are freed immediately,
 * while still under the page lock.  The handler therefore must not touch
 * the bh once it has been unlocked.
 *
 * Note: unlock_buffer() sort-of does touch the bh after unlocking it, but
 * a race there is benign: after the unlock it only uses the bh's address
 * for hashing, so it doesn't actually touch the bh itself.
 */
static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate)
{
        if (!uptodate) {
                /* This happens, due to failed READA attempts. */
                clear_buffer_uptodate(bh);
        } else {
                set_buffer_uptodate(bh);
        }
        unlock_buffer(bh);
}
2371
2372 /*
2373  * On entry, the page is fully not uptodate.
2374  * On exit the page is fully uptodate in the areas outside (from,to)
2375  */
2376 int nobh_prepare_write(struct page *page, unsigned from, unsigned to,
2377                         get_block_t *get_block)
2378 {
2379         struct inode *inode = page->mapping->host;
2380         const unsigned blkbits = inode->i_blkbits;
2381         const unsigned blocksize = 1 << blkbits;
2382         struct buffer_head map_bh;
2383         struct buffer_head *read_bh[MAX_BUF_PER_PAGE];
2384         unsigned block_in_page;
2385         unsigned block_start;
2386         sector_t block_in_file;
2387         char *kaddr;
2388         int nr_reads = 0;
2389         int i;
2390         int ret = 0;
2391         int is_mapped_to_disk = 1;
2392         int dirtied_it = 0;
2393
2394         if (PageMappedToDisk(page))
2395                 return 0;
2396
2397         block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
2398         map_bh.b_page = page;
2399
2400         /*
2401          * We loop across all blocks in the page, whether or not they are
2402          * part of the affected region.  This is so we can discover if the
2403          * page is fully mapped-to-disk.
2404          */
2405         for (block_start = 0, block_in_page = 0;
2406                   block_start < PAGE_CACHE_SIZE;
2407                   block_in_page++, block_start += blocksize) {
2408                 unsigned block_end = block_start + blocksize;
2409                 int create;
2410
2411                 map_bh.b_state = 0;
2412                 create = 1;
2413                 if (block_start >= to)
2414                         create = 0;
2415                 ret = get_block(inode, block_in_file + block_in_page,
2416                                         &map_bh, create);
2417                 if (ret)
2418                         goto failed;
2419                 if (!buffer_mapped(&map_bh))
2420                         is_mapped_to_disk = 0;
2421                 if (buffer_new(&map_bh))
2422                         unmap_underlying_metadata(map_bh.b_bdev,
2423                                                         map_bh.b_blocknr);
2424                 if (PageUptodate(page))
2425                         continue;
2426                 if (buffer_new(&map_bh) || !buffer_mapped(&map_bh)) {
2427                         kaddr = kmap_atomic(page, KM_USER0);
2428                         if (block_start < from) {
2429                                 memset(kaddr+block_start, 0, from-block_start);
2430                                 dirtied_it = 1;
2431                         }
2432                         if (block_end > to) {
2433                                 memset(kaddr + to, 0, block_end - to);
2434                                 dirtied_it = 1;
2435                         }
2436                         flush_dcache_page(page);
2437                         kunmap_atomic(kaddr, KM_USER0);
2438                         continue;
2439                 }
2440                 if (buffer_uptodate(&map_bh))
2441                         continue;       /* reiserfs does this */
2442                 if (block_start < from || block_end > to) {
2443                         struct buffer_head *bh = alloc_buffer_head(GFP_NOFS);
2444
2445                         if (!bh) {
2446                                 ret = -ENOMEM;
2447                                 goto failed;
2448                         }
2449                         bh->b_state = map_bh.b_state;
2450                         atomic_set(&bh->b_count, 0);
2451                         bh->b_this_page = NULL;
2452                         bh->b_page = page;
2453                         bh->b_blocknr = map_bh.b_blocknr;
2454                         bh->b_size = blocksize;
2455                         bh->b_data = (char *)(long)block_start;
2456                         bh->b_bdev = map_bh.b_bdev;
2457                         bh->b_private = NULL;
2458                         read_bh[nr_reads++] = bh;
2459                 }
2460         }
2461
2462         if (nr_reads) {
2463                 struct buffer_head *bh;
2464
2465                 /*
2466                  * The page is locked, so these buffers are protected from
2467                  * any VM or truncate activity.  Hence we don't need to care
2468                  * for the buffer_head refcounts.
2469                  */
2470                 for (i = 0; i < nr_reads; i++) {
2471                         bh = read_bh[i];
2472                         lock_buffer(bh);
2473                         bh->b_end_io = end_buffer_read_nobh;
2474                         submit_bh(READ, bh);
2475                 }
2476                 for (i = 0; i < nr_reads; i++) {
2477                         bh = read_bh[i];
2478                         wait_on_buffer(bh);
2479                         if (!buffer_uptodate(bh))
2480                                 ret = -EIO;
2481                         free_buffer_head(bh);
2482                         read_bh[i] = NULL;
2483                 }
2484                 if (ret)
2485                         goto failed;
2486         }
2487
2488         if (is_mapped_to_disk)
2489                 SetPageMappedToDisk(page);
2490         SetPageUptodate(page);
2491
2492         /*
2493          * Setting the page dirty here isn't necessary for the prepare_write
2494          * function - commit_write will do that.  But if/when this function is
2495          * used within the pagefault handler to ensure that all mmapped pages
2496          * have backing space in the filesystem, we will need to dirty the page
2497          * if its contents were altered.
2498          */
2499         if (dirtied_it)
2500                 set_page_dirty(page);
2501
2502         return 0;
2503
2504 failed:
2505         for (i = 0; i < nr_reads; i++) {
2506                 if (read_bh[i])
2507                         free_buffer_head(read_bh[i]);
2508         }
2509
2510         /*
2511          * Error recovery is pretty slack.  Clear the page and mark it dirty
2512          * so we'll later zero out any blocks which _were_ allocated.
2513          */
2514         kaddr = kmap_atomic(page, KM_USER0);
2515         memset(kaddr, 0, PAGE_CACHE_SIZE);
2516         kunmap_atomic(kaddr, KM_USER0);
2517         SetPageUptodate(page);
2518         set_page_dirty(page);
2519         return ret;
2520 }
2521 EXPORT_SYMBOL(nobh_prepare_write);
2522
2523 int nobh_commit_write(struct file *file, struct page *page,
2524                 unsigned from, unsigned to)
2525 {
2526         struct inode *inode = page->mapping->host;
2527         loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
2528
2529         set_page_dirty(page);
2530         if (pos > inode->i_size) {
2531                 i_size_write(inode, pos);
2532                 mark_inode_dirty(inode);
2533         }
2534         return 0;
2535 }
2536 EXPORT_SYMBOL(nobh_commit_write);
2537
2538 /*
2539  * nobh_writepage() - based on block_full_write_page() except
2540  * that it tries to operate without attaching bufferheads to
2541  * the page.
2542  */
2543 int nobh_writepage(struct page *page, get_block_t *get_block,
2544                         struct writeback_control *wbc)
2545 {
2546         struct inode * const inode = page->mapping->host;
2547         loff_t i_size = i_size_read(inode);
2548         const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
2549         unsigned offset;
2550         void *kaddr;
2551         int ret;
2552
2553         /* Is the page fully inside i_size? */
2554         if (page->index < end_index)
2555                 goto out;
2556
2557         /* Is the page fully outside i_size? (truncate in progress) */
2558         offset = i_size & (PAGE_CACHE_SIZE-1);
2559         if (page->index >= end_index+1 || !offset) {
2560                 /*
2561                  * The page may have dirty, unmapped buffers.  For example,
2562                  * they may have been added in ext3_writepage().  Make them
2563                  * freeable here, so the page does not leak.
2564                  */
2565 #if 0
2566                 /* Not really sure about this  - do we need this ? */
2567                 if (page->mapping->a_ops->invalidatepage)
2568                         page->mapping->a_ops->invalidatepage(page, offset);
2569 #endif
2570                 unlock_page(page);
2571                 return 0; /* don't care */
2572         }
2573
2574         /*
2575          * The page straddles i_size.  It must be zeroed out on each and every
2576          * writepage invocation because it may be mmapped.  "A file is mapped
2577          * in multiples of the page size.  For a file that is not a multiple of
2578          * the  page size, the remaining memory is zeroed when mapped, and
2579          * writes to that region are not written out to the file."
2580          */
2581         kaddr = kmap_atomic(page, KM_USER0);
2582         memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
2583         flush_dcache_page(page);
2584         kunmap_atomic(kaddr, KM_USER0);
2585 out:
2586         ret = mpage_writepage(page, get_block, wbc);
2587         if (ret == -EAGAIN)
2588                 ret = __block_write_full_page(inode, page, get_block, wbc);
2589         return ret;
2590 }
2591 EXPORT_SYMBOL(nobh_writepage);
2592
2593 /*
2594  * This function assumes that ->prepare_write() uses nobh_prepare_write().
2595  */
2596 int nobh_truncate_page(struct address_space *mapping, loff_t from)
2597 {
2598         struct inode *inode = mapping->host;
2599         unsigned blocksize = 1 << inode->i_blkbits;
2600         pgoff_t index = from >> PAGE_CACHE_SHIFT;
2601         unsigned offset = from & (PAGE_CACHE_SIZE-1);
2602         unsigned to;
2603         struct page *page;
2604         struct address_space_operations *a_ops = mapping->a_ops;
2605         char *kaddr;
2606         int ret = 0;
2607
2608         if ((offset & (blocksize - 1)) == 0)
2609                 goto out;
2610
2611         ret = -ENOMEM;
2612         page = grab_cache_page(mapping, index);
2613         if (!page)
2614                 goto out;
2615
2616         to = (offset + blocksize) & ~(blocksize - 1);
2617         ret = a_ops->prepare_write(NULL, page, offset, to);
2618         if (ret == 0) {
2619                 kaddr = kmap_atomic(page, KM_USER0);
2620                 memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
2621                 flush_dcache_page(page);
2622                 kunmap_atomic(kaddr, KM_USER0);
2623                 set_page_dirty(page);
2624         }
2625         unlock_page(page);
2626         page_cache_release(page);
2627 out:
2628         return ret;
2629 }
2630 EXPORT_SYMBOL(nobh_truncate_page);
2631
2632 int block_truncate_page(struct address_space *mapping,
2633                         loff_t from, get_block_t *get_block)
2634 {
2635         pgoff_t index = from >> PAGE_CACHE_SHIFT;
2636         unsigned offset = from & (PAGE_CACHE_SIZE-1);
2637         unsigned blocksize;
2638         sector_t iblock;
2639         unsigned length, pos;
2640         struct inode *inode = mapping->host;
2641         struct page *page;
2642         struct buffer_head *bh;
2643         void *kaddr;
2644         int err;
2645
2646         blocksize = 1 << inode->i_blkbits;
2647         length = offset & (blocksize - 1);
2648
2649         /* Block boundary? Nothing to do */
2650         if (!length)
2651                 return 0;
2652
2653         length = blocksize - length;
2654         iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2655
2656         page = grab_cache_page(mapping, index);
2657         err = -ENOMEM;
2658         if (!page)
2659                 goto out;
2660
2661         if (!page_has_buffers(page))
2662                 create_empty_buffers(page, blocksize, 0);
2663
2664         /* Find the buffer that contains "offset" */
2665         bh = page_buffers(page);
2666         pos = blocksize;
2667         while (offset >= pos) {
2668                 bh = bh->b_this_page;
2669                 iblock++;
2670                 pos += blocksize;
2671         }
2672
2673         err = 0;
2674         if (!buffer_mapped(bh)) {
2675                 err = get_block(inode, iblock, bh, 0);
2676                 if (err)
2677                         goto unlock;
2678                 /* unmapped? It's a hole - nothing to do */
2679                 if (!buffer_mapped(bh))
2680                         goto unlock;
2681         }
2682
2683         /* Ok, it's mapped. Make sure it's up-to-date */
2684         if (PageUptodate(page))
2685                 set_buffer_uptodate(bh);
2686
2687         if (!buffer_uptodate(bh) && !buffer_delay(bh)) {
2688                 err = -EIO;
2689                 ll_rw_block(READ, 1, &bh);
2690                 wait_on_buffer(bh);
2691                 /* Uhhuh. Read error. Complain and punt. */
2692                 if (!buffer_uptodate(bh))
2693                         goto unlock;
2694         }
2695
2696         kaddr = kmap_atomic(page, KM_USER0);
2697         memset(kaddr + offset, 0, length);
2698         flush_dcache_page(page);
2699         kunmap_atomic(kaddr, KM_USER0);
2700
2701         mark_buffer_dirty(bh);
2702         err = 0;
2703
2704 unlock:
2705         unlock_page(page);
2706         page_cache_release(page);
2707 out:
2708         return err;
2709 }
2710
2711 /*
2712  * The generic ->writepage function for buffer-backed address_spaces
2713  */
2714 int block_write_full_page(struct page *page, get_block_t *get_block,
2715                         struct writeback_control *wbc)
2716 {
2717         struct inode * const inode = page->mapping->host;
2718         loff_t i_size = i_size_read(inode);
2719         const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
2720         unsigned offset;
2721         void *kaddr;
2722
2723         /* Is the page fully inside i_size? */
2724         if (page->index < end_index)
2725                 return __block_write_full_page(inode, page, get_block, wbc);
2726
2727         /* Is the page fully outside i_size? (truncate in progress) */
2728         offset = i_size & (PAGE_CACHE_SIZE-1);
2729         if (page->index >= end_index+1 || !offset) {
2730                 /*
2731                  * The page may have dirty, unmapped buffers.  For example,
2732                  * they may have been added in ext3_writepage().  Make them
2733                  * freeable here, so the page does not leak.
2734                  */
2735                 do_invalidatepage(page, 0);
2736                 unlock_page(page);
2737                 return 0; /* don't care */
2738         }
2739
2740         /*
2741          * The page straddles i_size.  It must be zeroed out on each and every
2742          * writepage invokation because it may be mmapped.  "A file is mapped
2743          * in multiples of the page size.  For a file that is not a multiple of
2744          * the  page size, the remaining memory is zeroed when mapped, and
2745          * writes to that region are not written out to the file."
2746          */
2747         kaddr = kmap_atomic(page, KM_USER0);
2748         memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
2749         flush_dcache_page(page);
2750         kunmap_atomic(kaddr, KM_USER0);
2751         return __block_write_full_page(inode, page, get_block, wbc);
2752 }
2753
2754 sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
2755                             get_block_t *get_block)
2756 {
2757         struct buffer_head tmp;
2758         struct inode *inode = mapping->host;
2759         tmp.b_state = 0;
2760         tmp.b_blocknr = 0;
2761         get_block(inode, block, &tmp, 0);
2762         return tmp.b_blocknr;
2763 }
2764
2765 static int end_bio_bh_io_sync(struct bio *bio, unsigned int bytes_done, int err)
2766 {
2767         struct buffer_head *bh = bio->bi_private;
2768
2769         if (bio->bi_size)
2770                 return 1;
2771
2772         if (err == -EOPNOTSUPP) {
2773                 set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
2774                 set_bit(BH_Eopnotsupp, &bh->b_state);
2775         }
2776
2777         bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags));
2778         bio_put(bio);
2779         return 0;
2780 }
2781
2782 int submit_bh(int rw, struct buffer_head * bh)
2783 {
2784         struct bio *bio;
2785         int ret = 0;
2786
2787         BUG_ON(!buffer_locked(bh));
2788         BUG_ON(!buffer_mapped(bh));
2789         BUG_ON(!bh->b_end_io);
2790
2791         if (buffer_ordered(bh) && (rw == WRITE))
2792                 rw = WRITE_BARRIER;
2793
2794         /*
2795          * Only clear out a write error when rewriting, should this
2796          * include WRITE_SYNC as well?
2797          */
2798         if (test_set_buffer_req(bh) && (rw == WRITE || rw == WRITE_BARRIER))
2799                 clear_buffer_write_io_error(bh);
2800
2801         /*
2802          * from here on down, it's all bio -- do the initial mapping,
2803          * submit_bio -> generic_make_request may further map this bio around
2804          */
2805         bio = bio_alloc(GFP_NOIO, 1);
2806
2807         bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
2808         bio->bi_bdev = bh->b_bdev;
2809         bio->bi_io_vec[0].bv_page = bh->b_page;
2810         bio->bi_io_vec[0].bv_len = bh->b_size;
2811         bio->bi_io_vec[0].bv_offset = bh_offset(bh);
2812
2813         bio->bi_vcnt = 1;
2814         bio->bi_idx = 0;
2815         bio->bi_size = bh->b_size;
2816
2817         bio->bi_end_io = end_bio_bh_io_sync;
2818         bio->bi_private = bh;
2819
2820         bio_get(bio);
2821         submit_bio(rw, bio);
2822
2823         if (bio_flagged(bio, BIO_EOPNOTSUPP))
2824                 ret = -EOPNOTSUPP;
2825
2826         bio_put(bio);
2827         return ret;
2828 }
2829
2830 /**
2831  * ll_rw_block: low-level access to block devices (DEPRECATED)
2832  * @rw: whether to %READ or %WRITE or %SWRITE or maybe %READA (readahead)
2833  * @nr: number of &struct buffer_heads in the array
2834  * @bhs: array of pointers to &struct buffer_head
2835  *
2836  * ll_rw_block() takes an array of pointers to &struct buffer_heads, and
2837  * requests an I/O operation on them, either a %READ or a %WRITE.  The third
2838  * %SWRITE is like %WRITE only we make sure that the *current* data in buffers
2839  * are sent to disk. The fourth %READA option is described in the documentation
2840  * for generic_make_request() which ll_rw_block() calls.
2841  *
2842  * This function drops any buffer that it cannot get a lock on (with the
2843  * BH_Lock state bit) unless SWRITE is required, any buffer that appears to be
2844  * clean when doing a write request, and any buffer that appears to be
2845  * up-to-date when doing read request.  Further it marks as clean buffers that
2846  * are processed for writing (the buffer cache won't assume that they are
2847  * actually clean until the buffer gets unlocked).
2848  *
2849  * ll_rw_block sets b_end_io to simple completion handler that marks
2850  * the buffer up-to-date (if approriate), unlocks the buffer and wakes
2851  * any waiters.
2852  *
2853  * All of the buffers must be for the same device, and must also be a
2854  * multiple of the current approved size for the device.
2855  */
2856 void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
2857 {
2858         int i;
2859
2860         for (i = 0; i < nr; i++) {
2861                 struct buffer_head *bh = bhs[i];
2862
2863                 if (rw == SWRITE)
2864                         lock_buffer(bh);
2865                 else if (test_set_buffer_locked(bh))
2866                         continue;
2867
2868                 get_bh(bh);
2869                 if (rw == WRITE || rw == SWRITE) {
2870                         if (test_clear_buffer_dirty(bh)) {
2871                                 bh->b_end_io = end_buffer_write_sync;
2872                                 submit_bh(WRITE, bh);
2873                                 continue;
2874                         }
2875                 } else {
2876                         if (!buffer_uptodate(bh)) {
2877                                 bh->b_end_io = end_buffer_read_sync;
2878                                 submit_bh(rw, bh);
2879                                 continue;
2880                         }
2881                 }
2882                 unlock_buffer(bh);
2883                 put_bh(bh);
2884         }
2885 }
2886
2887 /*
2888  * For a data-integrity writeout, we need to wait upon any in-progress I/O
2889  * and then start new I/O and then wait upon it.  The caller must have a ref on
2890  * the buffer_head.
2891  */
2892 int sync_dirty_buffer(struct buffer_head *bh)
2893 {
2894         int ret = 0;
2895
2896         WARN_ON(atomic_read(&bh->b_count) < 1);
2897         lock_buffer(bh);
2898         if (test_clear_buffer_dirty(bh)) {
2899                 get_bh(bh);
2900                 bh->b_end_io = end_buffer_write_sync;
2901                 ret = submit_bh(WRITE, bh);
2902                 wait_on_buffer(bh);
2903                 if (buffer_eopnotsupp(bh)) {
2904                         clear_buffer_eopnotsupp(bh);
2905                         ret = -EOPNOTSUPP;
2906                 }
2907                 if (!ret && !buffer_uptodate(bh))
2908                         ret = -EIO;
2909         } else {
2910                 unlock_buffer(bh);
2911         }
2912         return ret;
2913 }
2914
2915 /*
2916  * try_to_free_buffers() checks if all the buffers on this particular page
2917  * are unused, and releases them if so.
2918  *
2919  * Exclusion against try_to_free_buffers may be obtained by either
2920  * locking the page or by holding its mapping's private_lock.
2921  *
2922  * If the page is dirty but all the buffers are clean then we need to
2923  * be sure to mark the page clean as well.  This is because the page
2924  * may be against a block device, and a later reattachment of buffers
2925  * to a dirty page will set *all* buffers dirty.  Which would corrupt
2926  * filesystem data on the same device.
2927  *
2928  * The same applies to regular filesystem pages: if all the buffers are
2929  * clean then we set the page clean and proceed.  To do that, we require
2930  * total exclusion from __set_page_dirty_buffers().  That is obtained with
2931  * private_lock.
2932  *
2933  * try_to_free_buffers() is non-blocking.
2934  */
2935 static inline int buffer_busy(struct buffer_head *bh)
2936 {
2937         return atomic_read(&bh->b_count) |
2938                 (bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock)));
2939 }
2940
2941 static int
2942 drop_buffers(struct page *page, struct buffer_head **buffers_to_free)
2943 {
2944         struct buffer_head *head = page_buffers(page);
2945         struct buffer_head *bh;
2946
2947         bh = head;
2948         do {
2949                 if (buffer_write_io_error(bh) && page->mapping)
2950                         set_bit(AS_EIO, &page->mapping->flags);
2951                 if (buffer_busy(bh))
2952                         goto failed;
2953                 bh = bh->b_this_page;
2954         } while (bh != head);
2955
2956         do {
2957                 struct buffer_head *next = bh->b_this_page;
2958
2959                 if (!list_empty(&bh->b_assoc_buffers))
2960                         __remove_assoc_queue(bh);
2961                 bh = next;
2962         } while (bh != head);
2963         *buffers_to_free = head;
2964         __clear_page_buffers(page);
2965         return 1;
2966 failed:
2967         return 0;
2968 }
2969
2970 int try_to_free_buffers(struct page *page)
2971 {
2972         struct address_space * const mapping = page->mapping;
2973         struct buffer_head *buffers_to_free = NULL;
2974         int ret = 0;
2975
2976         BUG_ON(!PageLocked(page));
2977         if (PageWriteback(page))
2978                 return 0;
2979
2980         if (mapping == NULL) {          /* can this still happen? */
2981                 ret = drop_buffers(page, &buffers_to_free);
2982                 goto out;
2983         }
2984
2985         spin_lock(&mapping->private_lock);
2986         ret = drop_buffers(page, &buffers_to_free);
2987         if (ret) {
2988                 /*
2989                  * If the filesystem writes its buffers by hand (eg ext3)
2990                  * then we can have clean buffers against a dirty page.  We
2991                  * clean the page here; otherwise later reattachment of buffers
2992                  * could encounter a non-uptodate page, which is unresolvable.
2993                  * This only applies in the rare case where try_to_free_buffers
2994                  * succeeds but the page is not freed.
2995                  */
2996                 clear_page_dirty(page);
2997         }
2998         spin_unlock(&mapping->private_lock);
2999 out:
3000         if (buffers_to_free) {
3001                 struct buffer_head *bh = buffers_to_free;
3002
3003                 do {
3004                         struct buffer_head *next = bh->b_this_page;
3005                         free_buffer_head(bh);
3006                         bh = next;
3007                 } while (bh != buffers_to_free);
3008         }
3009         return ret;
3010 }
3011 EXPORT_SYMBOL(try_to_free_buffers);
3012
3013 int block_sync_page(struct page *page)
3014 {
3015         struct address_space *mapping;
3016
3017         smp_mb();
3018         mapping = page_mapping(page);
3019         if (mapping)
3020                 blk_run_backing_dev(mapping->backing_dev_info, page);
3021         return 0;
3022 }
3023
3024 /*
3025  * There are no bdflush tunables left.  But distributions are
3026  * still running obsolete flush daemons, so we terminate them here.
3027  *
3028  * Use of bdflush() is deprecated and will be removed in a future kernel.
3029  * The `pdflush' kernel threads fully replace bdflush daemons and this call.
3030  */
3031 asmlinkage long sys_bdflush(int func, long data)
3032 {
3033         static int msg_count;
3034
3035         if (!capable(CAP_SYS_ADMIN))
3036                 return -EPERM;
3037
3038         if (msg_count < 5) {
3039                 msg_count++;
3040                 printk(KERN_INFO
3041                         "warning: process `%s' used the obsolete bdflush"
3042                         " system call\n", current->comm);
3043                 printk(KERN_INFO "Fix your initscripts?\n");
3044         }
3045
3046         if (func == 1)
3047                 do_exit(0);
3048         return 0;
3049 }
3050
3051 /*
3052  * Buffer-head allocation
3053  */
3054 static kmem_cache_t *bh_cachep;
3055
3056 /*
3057  * Once the number of bh's in the machine exceeds this level, we start
3058  * stripping them in writeback.
3059  */
3060 static int max_buffer_heads;
3061
3062 int buffer_heads_over_limit;
3063
3064 struct bh_accounting {
3065         int nr;                 /* Number of live bh's */
3066         int ratelimit;          /* Limit cacheline bouncing */
3067 };
3068
3069 static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0};
3070
3071 static void recalc_bh_state(void)
3072 {
3073         int i;
3074         int tot = 0;
3075
3076         if (__get_cpu_var(bh_accounting).ratelimit++ < 4096)
3077                 return;
3078         __get_cpu_var(bh_accounting).ratelimit = 0;
3079         for_each_cpu(i)
3080                 tot += per_cpu(bh_accounting, i).nr;
3081         buffer_heads_over_limit = (tot > max_buffer_heads);
3082 }
3083
3084 struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
3085 {
3086         struct buffer_head *ret = kmem_cache_alloc(bh_cachep, gfp_flags);
3087         if (ret) {
3088                 get_cpu_var(bh_accounting).nr++;
3089                 recalc_bh_state();
3090                 put_cpu_var(bh_accounting);
3091         }
3092         return ret;
3093 }
3094 EXPORT_SYMBOL(alloc_buffer_head);
3095
3096 void free_buffer_head(struct buffer_head *bh)
3097 {
3098         BUG_ON(!list_empty(&bh->b_assoc_buffers));
3099         kmem_cache_free(bh_cachep, bh);
3100         get_cpu_var(bh_accounting).nr--;
3101         recalc_bh_state();
3102         put_cpu_var(bh_accounting);
3103 }
3104 EXPORT_SYMBOL(free_buffer_head);
3105
3106 static void
3107 init_buffer_head(void *data, kmem_cache_t *cachep, unsigned long flags)
3108 {
3109         if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
3110                             SLAB_CTOR_CONSTRUCTOR) {
3111                 struct buffer_head * bh = (struct buffer_head *)data;
3112
3113                 memset(bh, 0, sizeof(*bh));
3114                 INIT_LIST_HEAD(&bh->b_assoc_buffers);
3115         }
3116 }
3117
3118 #ifdef CONFIG_HOTPLUG_CPU
3119 static void buffer_exit_cpu(int cpu)
3120 {
3121         int i;
3122         struct bh_lru *b = &per_cpu(bh_lrus, cpu);
3123
3124         for (i = 0; i < BH_LRU_SIZE; i++) {
3125                 brelse(b->bhs[i]);
3126                 b->bhs[i] = NULL;
3127         }
3128 }
3129
3130 static int buffer_cpu_notify(struct notifier_block *self,
3131                               unsigned long action, void *hcpu)
3132 {
3133         if (action == CPU_DEAD)
3134                 buffer_exit_cpu((unsigned long)hcpu);
3135         return NOTIFY_OK;
3136 }
3137 #endif /* CONFIG_HOTPLUG_CPU */
3138
3139 void __init buffer_init(void)
3140 {
3141         int nrpages;
3142
3143         bh_cachep = kmem_cache_create("buffer_head",
3144                         sizeof(struct buffer_head), 0,
3145                         SLAB_RECLAIM_ACCOUNT|SLAB_PANIC, init_buffer_head, NULL);
3146
3147         /*
3148          * Limit the bh occupancy to 10% of ZONE_NORMAL
3149          */
3150         nrpages = (nr_free_buffer_pages() * 10) / 100;
3151         max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head));
3152         hotcpu_notifier(buffer_cpu_notify, 0);
3153 }
3154
3155 EXPORT_SYMBOL(__bforget);
3156 EXPORT_SYMBOL(__brelse);
3157 EXPORT_SYMBOL(__wait_on_buffer);
3158 EXPORT_SYMBOL(block_commit_write);
3159 EXPORT_SYMBOL(block_prepare_write);
3160 EXPORT_SYMBOL(block_read_full_page);
3161 EXPORT_SYMBOL(block_sync_page);
3162 EXPORT_SYMBOL(block_truncate_page);
3163 EXPORT_SYMBOL(block_write_full_page);
3164 EXPORT_SYMBOL(cont_prepare_write);
3165 EXPORT_SYMBOL(end_buffer_async_write);
3166 EXPORT_SYMBOL(end_buffer_read_sync);
3167 EXPORT_SYMBOL(end_buffer_write_sync);
3168 EXPORT_SYMBOL(file_fsync);
3169 EXPORT_SYMBOL(fsync_bdev);
3170 EXPORT_SYMBOL(generic_block_bmap);
3171 EXPORT_SYMBOL(generic_commit_write);
3172 EXPORT_SYMBOL(generic_cont_expand);
3173 EXPORT_SYMBOL(generic_cont_expand_simple);
3174 EXPORT_SYMBOL(init_buffer);
3175 EXPORT_SYMBOL(invalidate_bdev);
3176 EXPORT_SYMBOL(ll_rw_block);
3177 EXPORT_SYMBOL(mark_buffer_dirty);
3178 EXPORT_SYMBOL(submit_bh);
3179 EXPORT_SYMBOL(sync_dirty_buffer);
3180 EXPORT_SYMBOL(unlock_buffer);