Merge tag 'omap-for-v3.10/dt-fixes-for-merge-window' of git://git.kernel.org/pub...
author Olof Johansson <olof@lixom.net>
Thu, 9 May 2013 20:14:02 +0000 (13:14 -0700)
committer Olof Johansson <olof@lixom.net>
Thu, 9 May 2013 20:14:02 +0000 (13:14 -0700)
From Tony Lindgren:
Omap device tree fixes for issues discovered during the merge window:

- Fix triggering for GPIO interrupts that's needed for 4430sdp
  Ethernet. Otherwise booting with nfsroot won't work.
- Fix CPU operating point values
- Fix wrong assumption that twl PMIC is always connected to omap3
- Add gpmc for am33xx so beaglebone users can use the bus
- Cosmetic fix for mcspi pin muxing to avoid confusion

* tag 'omap-for-v3.10/dt-fixes-for-merge-window' of git://git.kernel.org/pub/scm/linux/kernel/git/tmlind/linux-omap:
  ARM: dts: don't assume boards are using twl4030 for omap3
  ARM: dts: Configure and fix the McSPI pins for 4430sdp
  ARM: dts: AM33XX: Add GPMC node
  ARM: dts: OMAP4460: Fix CPU OPP voltages
  ARM: dts: OMAP36xx: Fix CPU OPP voltages
  ARM: dts: OMAP4: Fix ethernet IRQ for OMAP4 boards
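
As a quick illustration of the GPIO interrupt triggering fix mentioned above: the
Ethernet controller on 4430sdp gets its interrupt from a GPIO line, and the
interrupt specifier in the device tree has to spell out the trigger type for the
line to fire at all. A minimal sketch of such a node; the node name, GPIO bank,
line number and trigger value here are illustrative, not copied from the actual
patch:

	ethernet@0 {
		compatible = "ks8851";
		interrupt-parent = <&gpio2>;
		/* second cell is the trigger type; 8 = active-low level */
		interrupts = <2 8>;
	};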

612 files changed:
Documentation/ABI/testing/sysfs-block-bcache [new file with mode: 0644]
Documentation/ABI/testing/sysfs-class-mtd
Documentation/acpi/enumeration.txt
Documentation/bcache.txt [new file with mode: 0644]
Documentation/block/cfq-iosched.txt
Documentation/cgroups/memory.txt
Documentation/devicetree/bindings/dma/atmel-dma.txt
Documentation/devicetree/bindings/mtd/gpmc-nand.txt
Documentation/devicetree/bindings/mtd/partition.txt
Documentation/devicetree/bindings/net/gpmc-eth.txt
Documentation/devicetree/bindings/thermal/armada-thermal.txt [new file with mode: 0644]
Documentation/dmatest.txt [new file with mode: 0644]
Documentation/filesystems/f2fs.txt
Documentation/gpio.txt
Documentation/thermal/exynos_thermal_emulation
Documentation/thermal/sysfs-api.txt
Documentation/zh_CN/gpio.txt
MAINTAINERS
arch/alpha/Kconfig
arch/arm/Kconfig
arch/arm/boot/dts/Makefile
arch/arm/boot/dts/cros5250-common.dtsi
arch/arm/boot/dts/xenvm-4.2.dts
arch/arm/configs/omap2plus_defconfig
arch/arm/include/asm/xen/hypercall.h
arch/arm/kernel/devtree.c
arch/arm/kernel/setup.c
arch/arm/mach-imx/Kconfig
arch/arm/mach-imx/Makefile
arch/arm/mach-imx/common.h
arch/arm/mach-imx/headsmp.S
arch/arm/mach-imx/hotplug.c
arch/arm/mach-imx/iram_alloc.c [deleted file]
arch/arm/mach-msm/last_radio_log.c
arch/arm/mach-omap1/dma.c
arch/arm/mach-omap2/Kconfig
arch/arm/mach-omap2/Makefile
arch/arm/mach-omap2/board-omap3beagle.c
arch/arm/mach-omap2/board-rx51-peripherals.c
arch/arm/mach-omap2/dma.c
arch/arm/mach-omap2/gpmc.c
arch/arm/mach-omap2/id.c
arch/arm/mach-omap2/mux34xx.h
arch/arm/mach-omap2/omap_device.c
arch/arm/mach-omap2/soc.h
arch/arm/mach-omap2/timer.c
arch/arm/mach-prima2/Kconfig
arch/arm/mach-pxa/Kconfig
arch/arm/mach-spear/spear13xx.c
arch/arm/mach-tegra/Kconfig
arch/arm/mach-ux500/Kconfig
arch/arm/mach-ux500/board-mop500.c
arch/arm/mach-ux500/cpu-db8500.c
arch/arm/mach-vexpress/v2m.c
arch/arm/mach-virt/virt.c
arch/arm/plat-orion/Makefile
arch/arm/plat-orion/gpio.c
arch/arm/xen/enlighten.c
arch/arm/xen/hypercall.S
arch/arm64/Kconfig
arch/arm64/boot/dts/foundation-v8.dts
arch/arm64/include/asm/system_misc.h
arch/arm64/kernel/process.c
arch/arm64/lib/bitops.S
arch/arm64/mm/fault.c
arch/avr32/Kconfig
arch/blackfin/Kconfig
arch/cris/Kconfig
arch/cris/arch-v32/drivers/Kconfig
arch/cris/kernel/profile.c
arch/hexagon/Kconfig
arch/hexagon/kernel/vm_entry.S
arch/ia64/Kconfig
arch/m68k/Kconfig
arch/m68k/Kconfig.cpu
arch/metag/Kconfig
arch/microblaze/Kconfig
arch/mips/Kconfig
arch/mips/loongson/common/Makefile
arch/mips/txx9/generic/setup.c
arch/openrisc/Kconfig
arch/parisc/Kconfig
arch/parisc/Kconfig.debug
arch/parisc/Makefile
arch/parisc/include/asm/atomic.h
arch/parisc/include/asm/dma-mapping.h
arch/parisc/include/asm/hardirq.h
arch/parisc/include/asm/processor.h
arch/parisc/include/asm/thread_info.h
arch/parisc/include/asm/tlbflush.h
arch/parisc/kernel/cache.c
arch/parisc/kernel/entry.S
arch/parisc/kernel/hpmc.S
arch/parisc/kernel/irq.c
arch/parisc/kernel/pacache.S
arch/parisc/kernel/setup.c
arch/parisc/kernel/smp.c
arch/parisc/kernel/syscall.S
arch/parisc/kernel/traps.c
arch/parisc/kernel/vmlinux.lds.S
arch/parisc/mm/init.c
arch/powerpc/Kconfig
arch/powerpc/platforms/40x/Kconfig
arch/powerpc/platforms/44x/Kconfig
arch/powerpc/platforms/85xx/Kconfig
arch/powerpc/platforms/86xx/Kconfig
arch/powerpc/platforms/8xx/Kconfig
arch/powerpc/platforms/Kconfig
arch/s390/hypfs/inode.c
arch/sh/Kconfig
arch/sh/boards/mach-sdk7786/Makefile
arch/sh/boards/mach-x3proto/Makefile
arch/sh/kernel/cpu/sh2a/Makefile
arch/sh/kernel/cpu/sh3/Makefile
arch/sh/kernel/cpu/sh4a/Makefile
arch/sparc/Kconfig
arch/sparc/kernel/leon_smp.c
arch/sparc/mm/init_32.c
arch/sparc/mm/init_64.c
arch/um/drivers/ubd_kern.c
arch/unicore32/Kconfig
arch/x86/Kconfig
arch/x86/pci/mrst.c
arch/xtensa/Kconfig
arch/xtensa/configs/iss_defconfig
arch/xtensa/configs/s6105_defconfig
arch/xtensa/platforms/iss/simdisk.c
block/blk-cgroup.c
block/blk-core.c
block/cfq-iosched.c
block/deadline-iosched.c
block/elevator.c
block/partitions/efi.c
block/scsi_ioctl.c
drivers/bcma/driver_mips.c
drivers/block/amiflop.c
drivers/block/aoe/aoeblk.c
drivers/block/aoe/aoecmd.c
drivers/block/ataflop.c
drivers/block/brd.c
drivers/block/cciss.c
drivers/block/cpqarray.c
drivers/block/drbd/drbd_actlog.c
drivers/block/drbd/drbd_bitmap.c
drivers/block/drbd/drbd_int.h
drivers/block/drbd/drbd_main.c
drivers/block/drbd/drbd_nl.c
drivers/block/drbd/drbd_proc.c
drivers/block/drbd/drbd_receiver.c
drivers/block/drbd/drbd_req.c
drivers/block/drbd/drbd_req.h
drivers/block/drbd/drbd_state.c
drivers/block/drbd/drbd_strings.c
drivers/block/drbd/drbd_worker.c
drivers/block/floppy.c
drivers/block/loop.c
drivers/block/mg_disk.c
drivers/block/mtip32xx/mtip32xx.c
drivers/block/mtip32xx/mtip32xx.h
drivers/block/paride/pcd.c
drivers/block/paride/pd.c
drivers/block/paride/pf.c
drivers/block/pktcdvd.c
drivers/block/rbd.c
drivers/block/swim.c
drivers/block/swim3.c
drivers/block/xen-blkfront.c
drivers/block/xsysace.c
drivers/block/z2ram.c
drivers/cdrom/gdrom.c
drivers/char/mem.c
drivers/dma/Kconfig
drivers/dma/Makefile
drivers/dma/acpi-dma.c [new file with mode: 0644]
drivers/dma/at_hdmac.c
drivers/dma/at_hdmac_regs.h
drivers/dma/coh901318.c
drivers/dma/dmaengine.c
drivers/dma/dmatest.c
drivers/dma/dw_dmac.c
drivers/dma/dw_dmac_regs.h
drivers/dma/imx-dma.c
drivers/dma/imx-sdma.c
drivers/dma/ioat/dma.c
drivers/dma/ioat/dma.h
drivers/dma/ioat/dma_v2.h
drivers/dma/ioat/dma_v3.c
drivers/dma/ioat/hw.h
drivers/dma/ioat/pci.c
drivers/dma/ioat/registers.h
drivers/dma/ipu/ipu_idmac.c
drivers/dma/of-dma.c
drivers/dma/omap-dma.c
drivers/dma/pch_dma.c
drivers/dma/pl330.c
drivers/dma/sh/Kconfig [new file with mode: 0644]
drivers/dma/sh/Makefile
drivers/dma/sh/sudmac.c [new file with mode: 0644]
drivers/dma/sirf-dma.c
drivers/dma/tegra20-apb-dma.c
drivers/dma/timb_dma.c
drivers/dma/txx9dmac.c
drivers/edac/edac_mc_sysfs.c
drivers/extcon/Kconfig
drivers/firewire/core-cdev.c
drivers/firewire/core-device.c
drivers/firewire/net.c
drivers/firewire/ohci.c
drivers/firewire/sbp2.c
drivers/gpio/Kconfig
drivers/gpio/gpio-lpc32xx.c
drivers/hwspinlock/Kconfig
drivers/i2c/busses/Kconfig
drivers/i2c/muxes/Kconfig
drivers/ide/ide-cd.c
drivers/ide/ide-gd.c
drivers/ide/ide-tape.c
drivers/infiniband/core/iwcm.c
drivers/infiniband/core/verbs.c
drivers/infiniband/hw/cxgb3/cxio_resource.c
drivers/infiniband/hw/cxgb3/iwch_provider.c
drivers/infiniband/hw/cxgb4/id_table.c
drivers/infiniband/hw/cxgb4/qp.c
drivers/infiniband/hw/ipath/ipath_file_ops.c
drivers/infiniband/hw/ipath/ipath_verbs.c
drivers/infiniband/hw/mlx4/cq.c
drivers/infiniband/hw/mlx4/mad.c
drivers/infiniband/hw/mlx4/qp.c
drivers/infiniband/hw/qib/qib_file_ops.c
drivers/infiniband/hw/qib/qib_sysfs.c
drivers/infiniband/hw/qib/qib_verbs.c
drivers/infiniband/ulp/ipoib/ipoib_cm.c
drivers/infiniband/ulp/ipoib/ipoib_main.c
drivers/infiniband/ulp/iser/iscsi_iser.c
drivers/infiniband/ulp/iser/iscsi_iser.h
drivers/infiniband/ulp/iser/iser_memory.c
drivers/infiniband/ulp/iser/iser_verbs.c
drivers/infiniband/ulp/srpt/ib_srpt.c
drivers/input/keyboard/Kconfig
drivers/input/misc/Kconfig
drivers/input/mouse/Kconfig
drivers/leds/Kconfig
drivers/md/Kconfig
drivers/md/Makefile
drivers/md/bcache/Kconfig [new file with mode: 0644]
drivers/md/bcache/Makefile [new file with mode: 0644]
drivers/md/bcache/alloc.c [new file with mode: 0644]
drivers/md/bcache/bcache.h [new file with mode: 0644]
drivers/md/bcache/bset.c [new file with mode: 0644]
drivers/md/bcache/bset.h [new file with mode: 0644]
drivers/md/bcache/btree.c [new file with mode: 0644]
drivers/md/bcache/btree.h [new file with mode: 0644]
drivers/md/bcache/closure.c [new file with mode: 0644]
drivers/md/bcache/closure.h [new file with mode: 0644]
drivers/md/bcache/debug.c [new file with mode: 0644]
drivers/md/bcache/debug.h [new file with mode: 0644]
drivers/md/bcache/io.c [new file with mode: 0644]
drivers/md/bcache/journal.c [new file with mode: 0644]
drivers/md/bcache/journal.h [new file with mode: 0644]
drivers/md/bcache/movinggc.c [new file with mode: 0644]
drivers/md/bcache/request.c [new file with mode: 0644]
drivers/md/bcache/request.h [new file with mode: 0644]
drivers/md/bcache/stats.c [new file with mode: 0644]
drivers/md/bcache/stats.h [new file with mode: 0644]
drivers/md/bcache/super.c [new file with mode: 0644]
drivers/md/bcache/sysfs.c [new file with mode: 0644]
drivers/md/bcache/sysfs.h [new file with mode: 0644]
drivers/md/bcache/trace.c [new file with mode: 0644]
drivers/md/bcache/util.c [new file with mode: 0644]
drivers/md/bcache/util.h [new file with mode: 0644]
drivers/md/bcache/writeback.c [new file with mode: 0644]
drivers/md/dm-crypt.c
drivers/md/dm-raid1.c
drivers/md/dm-stripe.c
drivers/md/dm-verity.c
drivers/md/dm.c
drivers/md/faulty.c
drivers/md/linear.c
drivers/md/md.c
drivers/md/raid0.c
drivers/md/raid1.c
drivers/md/raid10.c
drivers/md/raid5.c
drivers/memstick/core/mspro_block.c
drivers/message/fusion/mptsas.c
drivers/message/i2o/i2o_block.c
drivers/mmc/card/block.c
drivers/mtd/Kconfig
drivers/mtd/Makefile
drivers/mtd/chips/Kconfig
drivers/mtd/devices/Kconfig
drivers/mtd/devices/Makefile
drivers/mtd/devices/bcm47xxsflash.c
drivers/mtd/devices/bcm47xxsflash.h
drivers/mtd/devices/doc2000.c [deleted file]
drivers/mtd/devices/doc2001.c [deleted file]
drivers/mtd/devices/doc2001plus.c [deleted file]
drivers/mtd/devices/docecc.c [deleted file]
drivers/mtd/devices/docg3.c
drivers/mtd/devices/docprobe.c [deleted file]
drivers/mtd/devices/elm.c
drivers/mtd/devices/m25p80.c
drivers/mtd/devices/mtd_dataflash.c
drivers/mtd/maps/Kconfig
drivers/mtd/maps/Makefile
drivers/mtd/maps/bfin-async-flash.c
drivers/mtd/maps/ck804xrom.c
drivers/mtd/maps/dbox2-flash.c [deleted file]
drivers/mtd/maps/dc21285.c
drivers/mtd/maps/dilnetpc.c [deleted file]
drivers/mtd/maps/dmv182.c [deleted file]
drivers/mtd/maps/gpio-addr-flash.c
drivers/mtd/maps/h720x-flash.c [deleted file]
drivers/mtd/maps/impa7.c
drivers/mtd/maps/intel_vr_nor.c
drivers/mtd/maps/ixp2000.c [deleted file]
drivers/mtd/maps/ixp4xx.c
drivers/mtd/maps/lantiq-flash.c
drivers/mtd/maps/mbx860.c [deleted file]
drivers/mtd/maps/pci.c
drivers/mtd/maps/physmap.c
drivers/mtd/maps/physmap_of.c
drivers/mtd/maps/plat-ram.c
drivers/mtd/maps/pxa2xx-flash.c
drivers/mtd/maps/rbtx4939-flash.c
drivers/mtd/maps/rpxlite.c [deleted file]
drivers/mtd/maps/sa1100-flash.c
drivers/mtd/maps/solutionengine.c
drivers/mtd/maps/tqm8xxl.c [deleted file]
drivers/mtd/maps/tsunami_flash.c
drivers/mtd/mtd_blkdevs.c
drivers/mtd/mtdblock.c
drivers/mtd/mtdchar.c
drivers/mtd/mtdcore.c
drivers/mtd/mtdcore.h
drivers/mtd/mtdpart.c
drivers/mtd/nand/Kconfig
drivers/mtd/nand/Makefile
drivers/mtd/nand/atmel_nand.c
drivers/mtd/nand/bf5xx_nand.c
drivers/mtd/nand/cafe_nand.c
drivers/mtd/nand/davinci_nand.c
drivers/mtd/nand/denali_dt.c
drivers/mtd/nand/docg4.c
drivers/mtd/nand/fsmc_nand.c
drivers/mtd/nand/gpio.c
drivers/mtd/nand/h1910.c [deleted file]
drivers/mtd/nand/lpc32xx_mlc.c
drivers/mtd/nand/nand_base.c
drivers/mtd/nand/nand_bbt.c
drivers/mtd/nand/nand_ids.c
drivers/mtd/nand/nandsim.c
drivers/mtd/nand/nuc900_nand.c
drivers/mtd/nand/omap2.c
drivers/mtd/nand/orion_nand.c
drivers/mtd/nand/ppchameleonevb.c [deleted file]
drivers/mtd/nand/pxa3xx_nand.c
drivers/mtd/nand/rtc_from4.c [deleted file]
drivers/mtd/nand/sh_flctl.c
drivers/mtd/nand/sm_common.c
drivers/mtd/nand/txx9ndfmc.c
drivers/mtd/ofpart.c
drivers/mtd/onenand/Kconfig
drivers/mtd/onenand/Makefile
drivers/mtd/onenand/omap2.c
drivers/mtd/onenand/onenand_sim.c [deleted file]
drivers/mtd/sm_ftl.c
drivers/net/ethernet/adi/bfin_mac.c
drivers/net/ethernet/broadcom/cnic.c
drivers/net/ethernet/emulex/benet/be_cmds.c
drivers/net/ethernet/emulex/benet/be_main.c
drivers/net/ethernet/freescale/fec.h
drivers/net/ethernet/freescale/fec_main.c
drivers/net/ethernet/mellanox/mlx4/en_ethtool.c
drivers/net/ethernet/mellanox/mlx4/en_netdev.c
drivers/net/ethernet/mellanox/mlx4/eq.c
drivers/net/ethernet/mellanox/mlx4/mcg.c
drivers/net/ethernet/mellanox/mlx4/mlx4.h
drivers/net/ethernet/mellanox/mlx4/srq.c
drivers/net/ethernet/sfc/ptp.c
drivers/net/ethernet/toshiba/spider_net.c
drivers/net/hamradio/baycom_epp.c
drivers/net/hamradio/hdlcdrv.c
drivers/net/hamradio/yam.c
drivers/net/irda/bfin_sir.c
drivers/net/phy/Kconfig
drivers/net/team/team_mode_random.c
drivers/net/usb/cdc_ether.c
drivers/net/usb/qmi_wwan.c
drivers/net/usb/sierra_net.c
drivers/net/usb/usbnet.c
drivers/net/wireless/brcm80211/brcmfmac/p2p.c
drivers/net/wireless/mwifiex/cfg80211.c
drivers/of/of_mdio.c
drivers/parisc/sba_iommu.c
drivers/pci/bus.c
drivers/pci/msi.c
drivers/pci/probe.c
drivers/pinctrl/sh-pfc/Kconfig
drivers/regulator/Kconfig
drivers/remoteproc/Kconfig
drivers/remoteproc/Makefile
drivers/remoteproc/da8xx_remoteproc.c [new file with mode: 0644]
drivers/remoteproc/remoteproc_core.c
drivers/remoteproc/remoteproc_elf_loader.c
drivers/remoteproc/remoteproc_internal.h
drivers/remoteproc/remoteproc_virtio.c
drivers/remoteproc/ste_modem_rproc.c
drivers/rpmsg/Kconfig
drivers/rpmsg/virtio_rpmsg_bus.c
drivers/rtc/rtc-rs5c372.c
drivers/s390/block/dasd.c
drivers/s390/block/dcssblk.c
drivers/s390/block/scm_blk.c
drivers/scsi/libsas/sas_expander.c
drivers/scsi/mpt2sas/mpt2sas_transport.c
drivers/scsi/sd.c
drivers/scsi/sg.c
drivers/scsi/sr.c
drivers/spi/Kconfig
drivers/ssb/driver_mipscore.c
drivers/staging/android/Kconfig
drivers/staging/android/logger.c
drivers/staging/iio/accel/Kconfig
drivers/staging/iio/adc/Kconfig
drivers/staging/iio/addac/Kconfig
drivers/staging/iio/resolver/Kconfig
drivers/staging/iio/trigger/Kconfig
drivers/thermal/Kconfig
drivers/thermal/Makefile
drivers/thermal/armada_thermal.c [new file with mode: 0644]
drivers/thermal/cpu_cooling.c
drivers/thermal/db8500_cpufreq_cooling.c
drivers/thermal/db8500_thermal.c
drivers/thermal/dove_thermal.c
drivers/thermal/exynos_thermal.c
drivers/thermal/fair_share.c
drivers/thermal/kirkwood_thermal.c
drivers/thermal/rcar_thermal.c
drivers/thermal/step_wise.c
drivers/thermal/thermal_core.c [new file with mode: 0644]
drivers/thermal/thermal_core.h
drivers/thermal/thermal_sys.c [deleted file]
drivers/thermal/user_space.c
drivers/usb/gadget/inode.c
drivers/usb/phy/Kconfig
drivers/video/Kconfig
drivers/video/backlight/Kconfig
drivers/video/mxsfb.c
drivers/w1/masters/Kconfig
drivers/watchdog/ath79_wdt.c
drivers/watchdog/davinci_wdt.c
drivers/watchdog/s3c2410_wdt.c
drivers/watchdog/shwdt.c
drivers/watchdog/watchdog_dev.c
fs/9p/vfs_addr.c
fs/afs/write.c
fs/aio.c
fs/bio-integrity.c
fs/bio.c
fs/block_dev.c
fs/btrfs/extent_io.c
fs/btrfs/file.c
fs/btrfs/inode.c
fs/btrfs/volumes.c
fs/buffer.c
fs/ceph/file.c
fs/compat.c
fs/direct-io.c
fs/ecryptfs/file.c
fs/exofs/ore.c
fs/exofs/ore_raid.c
fs/ext2/inode.c
fs/ext3/inode.c
fs/ext3/super.c
fs/ext4/file.c
fs/ext4/indirect.c
fs/ext4/inode.c
fs/ext4/page-io.c
fs/ext4/super.c
fs/f2fs/checkpoint.c
fs/f2fs/data.c
fs/f2fs/debug.c
fs/f2fs/dir.c
fs/f2fs/f2fs.h
fs/f2fs/file.c
fs/f2fs/gc.c
fs/f2fs/gc.h
fs/f2fs/inode.c
fs/f2fs/namei.c
fs/f2fs/node.c
fs/f2fs/node.h
fs/f2fs/recovery.c
fs/f2fs/segment.c
fs/f2fs/segment.h
fs/f2fs/super.c
fs/f2fs/xattr.c
fs/fat/inode.c
fs/fs-writeback.c
fs/fuse/cuse.c
fs/fuse/dev.c
fs/fuse/file.c
fs/gfs2/aops.c
fs/gfs2/file.c
fs/gfs2/lops.c
fs/hfs/dir.c
fs/hfs/inode.c
fs/hfsplus/inode.c
fs/hugetlbfs/inode.c
fs/jfs/inode.c
fs/jfs/jfs_logmgr.c
fs/logfs/dev_bdev.c
fs/nfs/blocklayout/blocklayout.h
fs/nfs/blocklayout/blocklayoutdev.c
fs/nfs/blocklayout/blocklayoutdm.c
fs/nfs/nfs4_fs.h
fs/nfs/nfs4filelayout.h
fs/nfs/nfs4filelayoutdev.c
fs/nfs/nfs4proc.c
fs/nfs/nfs4state.c
fs/nfs/nfs4xdr.c
fs/nfs/super.c
fs/nilfs2/inode.c
fs/ntfs/file.c
fs/ntfs/inode.c
fs/ocfs2/aops.h
fs/ocfs2/dlmglue.c
fs/ocfs2/inode.h
fs/pipe.c
fs/read_write.c
fs/reiserfs/inode.c
fs/reiserfs/journal.c
fs/ubifs/file.c
fs/udf/inode.c
fs/xfs/xfs_aops.c
fs/xfs/xfs_file.c
include/linux/acpi_dma.h [new file with mode: 0644]
include/linux/aio.h
include/linux/backing-dev.h
include/linux/bio.h
include/linux/blk_types.h
include/linux/blkdev.h
include/linux/cgroup.h
include/linux/cgroup_subsys.h
include/linux/cpu_cooling.h
include/linux/dmaengine.h
include/linux/drbd.h
include/linux/drbd_limits.h
include/linux/errno.h
include/linux/f2fs_fs.h
include/linux/fs.h
include/linux/gpio.h
include/linux/hugetlb.h
include/linux/idr.h
include/linux/kref.h
include/linux/lru_cache.h
include/linux/mlx4/device.h
include/linux/mlx4/srq.h
include/linux/mm.h
include/linux/mtd/blktrans.h
include/linux/mtd/mtd.h
include/linux/mtd/nand.h
include/linux/mtd/physmap.h
include/linux/mtd/plat-ram.h
include/linux/nfs_xdr.h
include/linux/of_dma.h
include/linux/pid_namespace.h
include/linux/platform_data/elm.h
include/linux/platform_data/imx-iram.h [deleted file]
include/linux/random.h
include/linux/remoteproc.h
include/linux/rwsem.h
include/linux/sched.h
include/linux/sudmac.h [new file with mode: 0644]
include/linux/thermal.h
include/linux/usb/usbnet.h
include/linux/wait.h
include/linux/writeback.h
include/trace/events/bcache.h [new file with mode: 0644]
include/trace/events/block.h
include/trace/events/f2fs.h [new file with mode: 0644]
include/trace/events/writeback.h
include/uapi/linux/if_cablemodem.h
ipc/shm.c
kernel/fork.c
kernel/lockdep.c
kernel/printk.c
kernel/ptrace.c
kernel/relay.c
kernel/rwsem.c
kernel/trace/blktrace.c
lib/kobject.c
lib/lru_cache.c
lib/rwsem.c
mm/backing-dev.c
mm/bounce.c
mm/memcontrol.c
mm/mmap.c
mm/mmu_context.c
mm/page_io.c
mm/shmem.c
mm/slab_common.c
mm/swap.c
mm/vmalloc.c
net/core/dev.c
net/ipv4/gre.c
net/ipv4/udp.c
net/sunrpc/auth_gss/auth_gss.c
net/sunrpc/clnt.c
security/keys/internal.h
security/keys/keyctl.c
sound/core/pcm_native.c

diff --git a/Documentation/ABI/testing/sysfs-block-bcache b/Documentation/ABI/testing/sysfs-block-bcache
new file mode 100644 (file)
index 0000000..9e4bbc5
--- /dev/null
@@ -0,0 +1,156 @@
+What:          /sys/block/<disk>/bcache/unregister
+Date:          November 2010
+Contact:       Kent Overstreet <kent.overstreet@gmail.com>
+Description:
+               A write to this file causes the backing device or cache to be
+               unregistered. If a backing device had dirty data in the cache,
+               writeback mode is automatically disabled and all dirty data is
+               flushed before the device is unregistered. Caches unregister
+               all associated backing devices before unregistering themselves.
+
+What:          /sys/block/<disk>/bcache/clear_stats
+Date:          November 2010
+Contact:       Kent Overstreet <kent.overstreet@gmail.com>
+Description:
+               Writing to this file resets all the statistics for the device.
+
+What:          /sys/block/<disk>/bcache/cache
+Date:          November 2010
+Contact:       Kent Overstreet <kent.overstreet@gmail.com>
+Description:
+               For a backing device that has cache, a symlink to
+               the bcache/ dir of that cache.
+
+What:          /sys/block/<disk>/bcache/cache_hits
+Date:          November 2010
+Contact:       Kent Overstreet <kent.overstreet@gmail.com>
+Description:
+               For backing devices: integer number of full cache hits,
+               counted per bio. A partial cache hit counts as a miss.
+
+What:          /sys/block/<disk>/bcache/cache_misses
+Date:          November 2010
+Contact:       Kent Overstreet <kent.overstreet@gmail.com>
+Description:
+               For backing devices: integer number of cache misses.
+
+What:          /sys/block/<disk>/bcache/cache_hit_ratio
+Date:          November 2010
+Contact:       Kent Overstreet <kent.overstreet@gmail.com>
+Description:
+               For backing devices: cache hits as a percentage.
+
+What:          /sys/block/<disk>/bcache/sequential_cutoff
+Date:          November 2010
+Contact:       Kent Overstreet <kent.overstreet@gmail.com>
+Description:
+               For backing devices: Threshold past which sequential IO will
+               skip the cache. Read and written as bytes in human readable
+               units (i.e. echo 10M > sequential_cutoff).
+
+What:          /sys/block/<disk>/bcache/bypassed
+Date:          November 2010
+Contact:       Kent Overstreet <kent.overstreet@gmail.com>
+Description:
+               Sum of all reads and writes that have bypassed the cache (due
+               to the sequential cutoff).  Expressed as bytes in human
+               readable units.
+
+What:          /sys/block/<disk>/bcache/writeback
+Date:          November 2010
+Contact:       Kent Overstreet <kent.overstreet@gmail.com>
+Description:
+               For backing devices: When on, writeback caching is enabled and
+               writes will be buffered in the cache. When off, caching is in
+               writethrough mode; reads and writes will be added to the
+               cache but no write buffering will take place.
+
+What:          /sys/block/<disk>/bcache/writeback_running
+Date:          November 2010
+Contact:       Kent Overstreet <kent.overstreet@gmail.com>
+Description:
+               For backing devices: when off, dirty data will not be written
+               from the cache to the backing device. The cache will still be
+               used to buffer writes until it is mostly full, at which point
+               writes transparently revert to writethrough mode. Intended only
+               for benchmarking/testing.
+
+What:          /sys/block/<disk>/bcache/writeback_delay
+Date:          November 2010
+Contact:       Kent Overstreet <kent.overstreet@gmail.com>
+Description:
+               For backing devices: In writeback mode, when dirty data is
+               written to the cache and the cache held no dirty data for that
+               backing device, writeback from cache to backing device starts
+               after this delay, expressed as an integer number of seconds.
+
+What:          /sys/block/<disk>/bcache/writeback_percent
+Date:          November 2010
+Contact:       Kent Overstreet <kent.overstreet@gmail.com>
+Description:
+               For backing devices: If nonzero, writeback from cache to
+               backing device only takes place when more than this percentage
+               of the cache is used, allowing more write coalescing to take
+               place and reducing total number of writes sent to the backing
+               device. Integer between 0 and 40.
+
+What:          /sys/block/<disk>/bcache/synchronous
+Date:          November 2010
+Contact:       Kent Overstreet <kent.overstreet@gmail.com>
+Description:
+               For a cache, a boolean that allows synchronous mode to be
+               switched on and off. In synchronous mode all writes are ordered
+               such that the cache can reliably recover from unclean shutdown;
+               if disabled bcache will not generally wait for writes to
+               complete but if the cache is not shut down cleanly all data
+               will be discarded from the cache. Should not be turned off with
+               writeback caching enabled.
+
+What:          /sys/block/<disk>/bcache/discard
+Date:          November 2010
+Contact:       Kent Overstreet <kent.overstreet@gmail.com>
+Description:
+               For a cache, a boolean allowing discard/TRIM to be turned off
+               or back on if the device supports it.
+
+What:          /sys/block/<disk>/bcache/bucket_size
+Date:          November 2010
+Contact:       Kent Overstreet <kent.overstreet@gmail.com>
+Description:
+               For a cache, bucket size in human readable units, as set at
+               cache creation time; should match the erase block size of the
+               SSD for optimal performance.
+
+What:          /sys/block/<disk>/bcache/nbuckets
+Date:          November 2010
+Contact:       Kent Overstreet <kent.overstreet@gmail.com>
+Description:
+               For a cache, the number of usable buckets.
+
+What:          /sys/block/<disk>/bcache/tree_depth
+Date:          November 2010
+Contact:       Kent Overstreet <kent.overstreet@gmail.com>
+Description:
+               For a cache, height of the btree excluding leaf nodes (i.e. a
+               one node tree will have a depth of 0).
+
+What:          /sys/block/<disk>/bcache/btree_cache_size
+Date:          November 2010
+Contact:       Kent Overstreet <kent.overstreet@gmail.com>
+Description:
+               Number of btree buckets/nodes that are currently cached in
+               memory; the cache dynamically grows and shrinks in response to
+               memory pressure from the rest of the system.
+
+What:          /sys/block/<disk>/bcache/written
+Date:          November 2010
+Contact:       Kent Overstreet <kent.overstreet@gmail.com>
+Description:
+               For a cache, total amount of data in human readable units
+               written to the cache, excluding all metadata.
+
+What:          /sys/block/<disk>/bcache/btree_written
+Date:          November 2010
+Contact:       Kent Overstreet <kent.overstreet@gmail.com>
+Description:
+               For a cache, sum of all btree writes in human readable units.
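
The attributes above are plain sysfs files, so exercising them is just a matter of
echo and cat. A short shell sketch, assuming a backing device at /dev/sdb that has
already been registered (the device name is a placeholder; the paths follow the
ABI documented above):

  # bypass the cache for sequential IO larger than 10M, then reset the counters
  echo 10M > /sys/block/sdb/bcache/sequential_cutoff
  echo 1 > /sys/block/sdb/bcache/clear_stats

  # watch how much IO is hitting the cache and how much bypassed it
  cat /sys/block/sdb/bcache/cache_hit_ratio
  cat /sys/block/sdb/bcache/bypassed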
index 938ef71e2035e7c04203c7b65d2a120dbd96944a..3105644b3bfc45f27371765246f6d1deda46549b 100644 (file)
@@ -14,8 +14,7 @@ Description:
                The /sys/class/mtd/mtd{0,1,2,3,...} directories correspond
                to each /dev/mtdX character device.  These may represent
                physical/simulated flash devices, partitions on a flash
-               device, or concatenated flash devices.  They exist regardless
-               of whether CONFIG_MTD_CHAR is actually enabled.
+               device, or concatenated flash devices.
 
 What:          /sys/class/mtd/mtdXro/
 Date:          April 2009
@@ -23,8 +22,7 @@ KernelVersion:        2.6.29
 Contact:       linux-mtd@lists.infradead.org
 Description:
                These directories provide the corresponding read-only device
-               nodes for /sys/class/mtd/mtdX/ .  They are only created
-               (for the benefit of udev) if CONFIG_MTD_CHAR is enabled.
+               nodes for /sys/class/mtd/mtdX/ .
 
 What:          /sys/class/mtd/mtdX/dev
 Date:          April 2009
index b0d541042ac61c7d5cf580d4fc37a486ea96521e..d9be7a97dff35e7b1521e709e8a29c278d3fb434 100644 (file)
@@ -66,6 +66,83 @@ the ACPI device explicitly to acpi_platform_device_ids list defined in
 drivers/acpi/acpi_platform.c. This limitation is only for the platform
 devices, SPI and I2C devices are created automatically as described below.
 
+DMA support
+~~~~~~~~~~~
+DMA controllers enumerated via ACPI should be registered in the system to
+provide generic access to their resources. For example, a driver that would
+like to be accessible to slave devices via the generic API call
+dma_request_slave_channel() must register itself at the end of the probe
+function like this:
+
+       err = devm_acpi_dma_controller_register(dev, xlate_func, dw);
+       /* Handle the error if it's not a case of !CONFIG_ACPI */
+
+and implement a custom xlate function if needed (usually acpi_dma_simple_xlate()
+is enough) which converts the FixedDMA resource provided by struct
+acpi_dma_spec into the corresponding DMA channel. A piece of code for that case
+could look like:
+
+       #ifdef CONFIG_ACPI
+       struct filter_args {
+               /* Provide necessary information for the filter_func */
+               ...
+       };
+
+       static bool filter_func(struct dma_chan *chan, void *param)
+       {
+               /* Choose the proper channel */
+               ...
+       }
+
+       static struct dma_chan *xlate_func(struct acpi_dma_spec *dma_spec,
+                       struct acpi_dma *adma)
+       {
+               dma_cap_mask_t cap;
+               struct filter_args args;
+
+               /* Prepare arguments for filter_func */
+               ...
+               return dma_request_channel(cap, filter_func, &args);
+       }
+       #else
+       static struct dma_chan *xlate_func(struct acpi_dma_spec *dma_spec,
+                       struct acpi_dma *adma)
+       {
+               return NULL;
+       }
+       #endif
+
+dma_request_slave_channel() will call xlate_func() for each registered DMA
+controller. In the xlate function the proper channel must be chosen based on
+information in struct acpi_dma_spec and the properties of the controller
+provided by struct acpi_dma.
+
+Clients must call dma_request_slave_channel() with the string parameter that
+corresponds to a specific FixedDMA resource. By default "tx" means the first
+entry of the FixedDMA resource array, "rx" means the second entry. The example
+below shows such a layout:
+
+       Device (I2C0)
+       {
+               ...
+               Method (_CRS, 0, NotSerialized)
+               {
+                       Name (DBUF, ResourceTemplate ()
+                       {
+                               FixedDMA (0x0018, 0x0004, Width32bit, _Y48)
+                               FixedDMA (0x0019, 0x0005, Width32bit, )
+                       })
+               ...
+               }
+       }
+
+So, the FixedDMA with request line 0x0018 is "tx" and the next one is "rx" in
+this example.
+
+In more involved cases the client unfortunately needs to call
+acpi_dma_request_slave_chan_by_index() directly and select the specific
+FixedDMA resource by its index.
+
 SPI serial bus support
 ~~~~~~~~~~~~~~~~~~~~~~
 Slave devices behind SPI bus have SpiSerialBus resource attached to them.
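
Returning to the DMA support section above: on the client side the whole exchange
reduces to one call per channel name. A minimal probe-function sketch, assuming a
device whose _CRS carries "tx"/"rx" FixedDMA entries as in the ASL example (driver
and variable names are made up for illustration):

	#include <linux/dmaengine.h>
	#include <linux/platform_device.h>

	static int example_probe(struct platform_device *pdev)
	{
		struct dma_chan *tx, *rx;

		/* "tx" maps to the first FixedDMA entry, "rx" to the second */
		tx = dma_request_slave_channel(&pdev->dev, "tx");
		if (!tx)
			return -EPROBE_DEFER;	/* controller may not be registered yet */

		rx = dma_request_slave_channel(&pdev->dev, "rx");
		if (!rx) {
			dma_release_channel(tx);
			return -EPROBE_DEFER;
		}

		/* ... set up slave config and issue transfers on tx/rx ... */
		return 0;
	}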
diff --git a/Documentation/bcache.txt b/Documentation/bcache.txt
new file mode 100644 (file)
index 0000000..77db880
--- /dev/null
@@ -0,0 +1,431 @@
+Say you've got a big slow raid 6, and an X-25E or three. Wouldn't it be
+nice if you could use them as cache... Hence bcache.
+
+Wiki and git repositories are at:
+  http://bcache.evilpiepirate.org
+  http://evilpiepirate.org/git/linux-bcache.git
+  http://evilpiepirate.org/git/bcache-tools.git
+
+It's designed around the performance characteristics of SSDs - it only allocates
+in erase block sized buckets, and it uses a hybrid btree/log to track cached
+extents (which can be anywhere from a single sector to the bucket size). It's
+designed to avoid random writes at all costs; it fills up an erase block
+sequentially, then issues a discard before reusing it.
+
+Both writethrough and writeback caching are supported. Writeback defaults to
+off, but can be switched on and off arbitrarily at runtime. Bcache goes to
+great lengths to protect your data - it reliably handles unclean shutdown. (It
+doesn't even have a notion of a clean shutdown; bcache simply doesn't return
+writes as completed until they're on stable storage).
+
+Writeback caching can use most of the cache for buffering writes - writing
+dirty data to the backing device is always done sequentially, scanning from the
+start to the end of the index.
+
+Since random IO is what SSDs excel at, there generally won't be much benefit
+to caching large sequential IO. Bcache detects sequential IO and skips it;
+it also keeps a rolling average of the IO sizes per task, and as long as the
+average is above the cutoff it will skip all IO from that task - instead of
+caching the first 512k after every seek. Backups and large file copies should
+thus entirely bypass the cache.
+
+In the event of a data IO error on the flash it will try to recover by reading
+from disk or invalidating cache entries.  For unrecoverable errors (metadata
+or dirty data), caching is automatically disabled; if dirty data was present
+in the cache it first disables writeback caching and waits for all dirty data
+to be flushed.
+
+Getting started:
+You'll need make-bcache from the bcache-tools repository. Both the cache device
+and backing device must be formatted before use.
+  make-bcache -B /dev/sdb
+  make-bcache -C /dev/sdc
+
+make-bcache has the ability to format multiple devices at the same time - if
+you format your backing devices and cache device at the same time, you won't
+have to manually attach:
+  make-bcache -B /dev/sda /dev/sdb -C /dev/sdc
+
+To make bcache devices known to the kernel, echo them to /sys/fs/bcache/register:
+
+  echo /dev/sdb > /sys/fs/bcache/register
+  echo /dev/sdc > /sys/fs/bcache/register
+
+To register your bcache devices automatically, you could add something like
+this to an init script:
+
+  echo /dev/sd* > /sys/fs/bcache/register_quiet
+
+It'll look for bcache superblocks and ignore everything that doesn't have one.
+
+Registering the backing device makes the bcache show up in /dev; you can now
+format it and use it as normal. But the first time you use a new bcache device,
+it'll be running in passthrough mode until you attach it to a cache. See the
+section on attaching.
+
+The devices show up at /dev/bcacheN, and can be controlled via sysfs from
+/sys/block/bcacheN/bcache:
+
+  mkfs.ext4 /dev/bcache0
+  mount /dev/bcache0 /mnt
+
+Cache devices are managed as sets; multiple caches per set isn't supported yet
+but will allow for mirroring of metadata and dirty data in the future. Your new
+cache set shows up as /sys/fs/bcache/<UUID>
+
+ATTACHING:
+
+After your cache device and backing device are registered, the backing device
+must be attached to your cache set to enable caching. Attaching a backing
+device to a cache set is done thusly, with the UUID of the cache set in
+/sys/fs/bcache:
+
+  echo <UUID> > /sys/block/bcache0/bcache/attach
+
+This only has to be done once. The next time you reboot, just reregister all
+your bcache devices. If a backing device has data in a cache somewhere, the
+/dev/bcache# device won't be created until the cache shows up - particularly
+important if you have writeback caching turned on.
+
+If you're booting up and your cache device is gone and never coming back, you
+can force run the backing device:
+
+  echo 1 > /sys/block/sdb/bcache/running
+
+(You need to use /sys/block/sdb (or whatever your backing device is called), not
+/sys/block/bcache0, because bcache0 doesn't exist yet. If you're using a
+partition, the bcache directory would be at /sys/block/sdb/sdb2/bcache)
+
+The backing device will still use that cache set if it shows up in the future,
+but all the cached data will be invalidated. If there was dirty data in the
+cache, don't expect the filesystem to be recoverable - you will have massive
+filesystem corruption, though ext4's fsck does work miracles.
+
+ERROR HANDLING:
+
+Bcache tries to transparently handle IO errors to/from the cache device without
+affecting normal operation; if it sees too many errors (the threshold is
+configurable, and defaults to 0) it shuts down the cache device and switches all
+the backing devices to passthrough mode.
+
+ - For reads from the cache, if they error we just retry the read from the
+   backing device.
+
+ - For writethrough writes, if the write to the cache errors we just switch to
+   invalidating the data at that lba in the cache (i.e. the same thing we do for
+   a write that bypasses the cache)
+
+ - For writeback writes, we currently pass that error back up to the
+   filesystem/userspace. This could be improved - we could retry it as a write
+   that skips the cache so we don't have to error the write.
+
+ - When we detach, we first try to flush any dirty data (if we were running in
+   writeback mode). It currently doesn't do anything intelligent if it fails to
+   read some of the dirty data, though.
+
+TROUBLESHOOTING PERFORMANCE:
+
+Bcache has a bunch of config options and tunables. The defaults are intended to
+be reasonable for typical desktop and server workloads, but they're not what you
+want for getting the best possible numbers when benchmarking.
+
+ - Bad write performance
+
+   If write performance is not what you expected, you probably wanted to be
+   running in writeback mode, which isn't the default (not due to a lack of
+   maturity, but simply because in writeback mode you'll lose data if something
+   happens to your SSD)
+
+   # echo writeback > /sys/block/bcache0/cache_mode
+
+ - Bad performance, or traffic not going to the SSD that you'd expect
+
+   By default, bcache doesn't cache everything. It tries to skip sequential IO -
+   because you really want to be caching the random IO, and if you copy a 10
+   gigabyte file you probably don't want that pushing 10 gigabytes of randomly
+   accessed data out of your cache.
+
+   But if you want to benchmark reads from the cache and you start out with fio
+   writing an 8 gigabyte test file, you will want to disable that.
+
+   # echo 0 > /sys/block/bcache0/bcache/sequential_cutoff
+
+   To set it back to the default (4 MB), do
+
+   # echo 4M > /sys/block/bcache0/bcache/sequential_cutoff
+
+ - Traffic's still going to the spindle/still getting cache misses
+
+   In the real world, SSDs don't always keep up with disks - particularly with
+   slower SSDs, many disks being cached by one SSD, or mostly sequential IO. So
+   you want to avoid being bottlenecked by the SSD and having it slow everything
+   down.
+
+   To avoid that, bcache tracks latency to the cache device and gradually
+   throttles traffic if the latency exceeds a threshold (it does this by
+   cranking down the sequential bypass).
+
+   You can disable this if you need to by setting the thresholds to 0:
+
+   # echo 0 > /sys/fs/bcache/<cache set>/congested_read_threshold_us
+   # echo 0 > /sys/fs/bcache/<cache set>/congested_write_threshold_us
+
+   The default is 2000 us (2 milliseconds) for reads, and 20000 for writes.
+
+ - Still getting cache misses, of the same data
+
+   One last issue that sometimes trips people up is actually an old bug, due to
+   the way cache coherency is handled for cache misses. If a btree node is full,
+   a cache miss won't be able to insert a key for the new data and the data
+   won't be written to the cache.
+
+   In practice this isn't an issue because as soon as a write comes along it'll
+   cause the btree node to be split, and you need almost no write traffic for
+   this to not show up enough to be noticeable (especially since bcache's btree
+   nodes are huge and index large regions of the device). But when you're
+   benchmarking, if you're trying to warm the cache by reading a bunch of data
+   and there's no other traffic - that can be a problem.
+
+   Solution: warm the cache by doing writes, or use the testing branch (there's
+   a fix for the issue there).
+
+SYSFS - BACKING DEVICE:
+
+attach
+  Echo the UUID of a cache set to this file to enable caching.
+
+cache_mode
+  Can be one of either writethrough, writeback, writearound or none.
+
+clear_stats
+  Writing to this file resets the running total stats (not the day/hour/5 minute
+  decaying versions).
+
+detach
+  Write to this file to detach from a cache set. If there is dirty data in the
+  cache, it will be flushed first.
+
+dirty_data
+  Amount of dirty data for this backing device in the cache. Continuously
+  updated unlike the cache set's version, but may be slightly off.
+
+label
+  Name of underlying device.
+
+readahead
+  Size of readahead that should be performed.  Defaults to 0.  If set to e.g.
+  1M, it will round cache miss reads up to that size, but without overlapping
+  existing cache entries.
+
+running
+  1 if bcache is running (i.e. whether the /dev/bcache device exists, whether
+  it's in passthrough mode or caching).
+
+sequential_cutoff
+  A sequential IO will bypass the cache once it passes this threshold; the
+  most recent 128 IOs are tracked so sequential IO can be detected even when
+  it isn't all done at once.
+
+sequential_merge
+  If nonzero, bcache keeps a list of the last 128 requests submitted to compare
+  against all new requests to determine which new requests are sequential
+  continuations of previous requests for the purpose of determining sequential
+  cutoff. This is necessary if the sequential cutoff value is greater than the
+  maximum acceptable sequential size for any single request. 
+
+state
+  The backing device can be in one of four different states:
+
+  no cache: Has never been attached to a cache set.
+
+  clean: Part of a cache set, and there is no cached dirty data.
+
+  dirty: Part of a cache set, and there is cached dirty data.
+
+  inconsistent: The backing device was forcibly run by the user when there was
+  dirty data cached but the cache set was unavailable; whatever data was on the
+  backing device has likely been corrupted.
+
+stop
+  Write to this file to shut down the bcache device and close the backing
+  device.
+
+writeback_delay
+  When dirty data is written to the cache and it previously did not contain
+  any, waits some number of seconds before initiating writeback. Defaults to
+  30.
+
+writeback_percent
+  If nonzero, bcache tries to keep around this percentage of the cache dirty by
+  throttling background writeback and using a PD controller to smoothly adjust
+  the rate.
+
+writeback_rate
+  Rate in sectors per second - if writeback_percent is nonzero, background
+  writeback is throttled to this rate. Continuously adjusted by bcache but may
+  also be set by the user.
+
+writeback_running
+  If off, writeback of dirty data will not take place at all. Dirty data will
+  still be added to the cache until it is mostly full; only meant for
+  benchmarking. Defaults to on.
+
+SYSFS - BACKING DEVICE STATS:
+
+There are directories with these numbers for a running total, as well as
+versions that decay over the past day, hour and 5 minutes; they're also
+aggregated in the cache set directory.
+
+bypassed
+  Amount of IO (both reads and writes) that has bypassed the cache
+
+cache_hits
+cache_misses
+cache_hit_ratio
+  Hits and misses are counted per individual IO as bcache sees them; a
+  partial hit is counted as a miss.
+
+cache_bypass_hits
+cache_bypass_misses
+  Hits and misses for IO that is intended to skip the cache are still counted,
+  but broken out here.
+
+cache_miss_collisions
+  Counts instances where data was going to be inserted into the cache from a
+  cache miss, but raced with a write and data was already present (usually 0
+  since the synchronization for cache misses was rewritten)
+
+cache_readaheads
+  Count of times readahead occurred.
+
+SYSFS - CACHE SET:
+
+average_key_size
+  Average data per key in the btree.
+
+bdev<0..n>
+  Symlink to each of the attached backing devices.
+
+block_size
+  Block size of the cache devices.
+
+btree_cache_size
+  Amount of memory currently used by the btree cache
+
+bucket_size
+  Size of buckets
+
+cache<0..n>
+  Symlink to each of the cache devices comprising this cache set. 
+
+cache_available_percent
+  Percentage of cache device free.
+
+clear_stats
+  Clears the statistics associated with this cache
+
+dirty_data
+  Amount of dirty data in the cache (updated when garbage collection runs).
+
+flash_vol_create
+  Echoing a size to this file (in human readable units, k/M/G) creates a thinly
+  provisioned volume backed by the cache set.
+
+io_error_halflife
+io_error_limit
+  These determine how many errors we accept before disabling the cache.
+  Each error is decayed by the half life (in # ios).  If the decaying count
+  reaches io_error_limit, dirty data is written out and the cache is disabled.
+
+journal_delay_ms
+  Journal writes will delay for up to this many milliseconds, unless a cache
+  flush happens sooner. Defaults to 100.
+
+root_usage_percent
+  Percentage of the root btree node in use.  If this gets too high the node
+  will split, increasing the tree depth.
+
+stop
+  Write to this file to shut down the cache set - waits until all attached
+  backing devices have been shut down.
+
+tree_depth
+  Depth of the btree (A single node btree has depth 0).
+
+unregister
+  Detaches all backing devices and closes the cache devices; if dirty data is
+  present it will disable writeback caching and wait for it to be flushed.
+
+SYSFS - CACHE SET INTERNAL:
+
+This directory also exposes timings for a number of internal operations, with
+separate files for average duration, average frequency, last occurrence and max
+duration: garbage collection, btree read, btree node sorts and btree splits.
+
+active_journal_entries
+  Number of journal entries that are newer than the index.
+
+btree_nodes
+  Total nodes in the btree.
+
+btree_used_percent
+  Average fraction of btree in use.
+
+bset_tree_stats
+  Statistics about the auxiliary search trees
+
+btree_cache_max_chain
+  Longest chain in the btree node cache's hash table
+
+cache_read_races
+  Counts instances where while data was being read from the cache, the bucket
+  was reused and invalidated - i.e. where the pointer was stale after the read
+  completed. When this occurs the data is reread from the backing device.
+
+trigger_gc
+  Writing to this file forces garbage collection to run.
+
+SYSFS - CACHE DEVICE:
+
+block_size
+  Minimum granularity of writes - should match hardware sector size.
+
+btree_written
+  Sum of all btree writes, in (kilo/mega/giga) bytes
+
+bucket_size
+  Size of buckets
+
+cache_replacement_policy
+  One of either lru, fifo or random.
+
+discard
+  Boolean; if on a discard/TRIM will be issued to each bucket before it is
+  reused. Defaults to off, since SATA TRIM is an unqueued command (and thus
+  slow).
+
+freelist_percent
+  Size of the freelist as a percentage of nbuckets. Can be written to in order
+  to increase the number of buckets kept on the freelist, which lets you
+  artificially reduce the size of the cache at runtime. Mostly for testing
+  purposes (i.e. testing how different size caches affect your hit rate), but
+  since buckets are discarded when they move onto the freelist, it will also
+  make the SSD's garbage collection easier by effectively giving it more
+  reserved space.
+
+io_errors
+  Number of errors that have occurred, decayed by io_error_halflife.
+
+metadata_written
+  Sum of all non data writes (btree writes and all other metadata).
+
+nbuckets
+  Total buckets in this cache
+
+priority_stats
+  Statistics about how recently data in the cache has been accessed.  This can
+  reveal your working set size.
+
+written
+  Sum of all data that has been written to the cache; comparison with
+  btree_written gives the amount of write inflation in bcache.
index a5eb7d19a65d241650e26a2b4e5a303437b46878..9887f0414c16642d204296d9c0b8abdc8096a5da 100644 (file)
@@ -5,7 +5,7 @@ The main aim of CFQ scheduler is to provide a fair allocation of the disk
 I/O bandwidth for all the processes which requests an I/O operation.
 
 CFQ maintains the per process queue for the processes which request I/O
-operation(syncronous requests). In case of asynchronous requests, all the
+operation(synchronous requests). In case of asynchronous requests, all the
 requests from all the processes are batched together according to their
 process's I/O priority.
 
@@ -66,6 +66,47 @@ This parameter is used to set the timeout of synchronous requests. Default
 value of this is 124ms. In case to favor synchronous requests over asynchronous
 one, this value should be decreased relative to fifo_expire_async.
 
+group_idle
+-----------
+This parameter forces idling at the CFQ group level instead of the CFQ
+queue level. It was introduced after a bottleneck was observed on higher
+end storage: idling on each sequential queue allowed dispatch from only a
+single queue at a time. The idea with this parameter is that it can be run
+with slice_idle=0 and group_idle=8, so that idling does not happen on
+individual queues in the group but on the group as a whole, which still
+keeps the IO controller working.
+Not idling on individual queues in the group will dispatch requests from
+multiple queues in the group at the same time and achieve higher throughput
+on higher end storage.
+
+Default value for this parameter is 8ms.
+
+latency
+-------
+This parameter is used to enable/disable the latency mode of the CFQ
+scheduler. If latency mode (called low_latency) is enabled, CFQ tries
+to recompute the slice time for each process based on the target_latency set
+for the system. This favors fairness over throughput. Disabling low
+latency (setting it to 0) ignores target latency, allowing each process in the
+system to get a full time slice.
+
+By default low latency mode is enabled.
+
+target_latency
+--------------
+This parameter is used to calculate the time slice for a process if cfq's
+latency mode is enabled. It will ensure that sync requests have an estimated
+latency. But if the sequential workload dominates (e.g. sequential reads),
+then to meet the latency constraints, throughput may decrease because each
+process has less time to issue I/O requests before the cfq queue is switched.
+
+Though this can be overcome by disabling latency mode, doing so may increase
+the read latency for some applications. This parameter allows target_latency
+to be changed through the sysfs interface, which makes it possible to balance
+throughput and read latency.
+
+Default value for target_latency is 300ms.
+
 slice_async
 -----------
 This parameter is same as of slice_sync but for asynchronous queue. The
@@ -98,8 +139,8 @@ in the device exceeds this parameter. This parameter is used for synchronous
 request.
 
 In case of storage with several disk, this setting can limit the parallel
-processing of request. Therefore, increasing the value can imporve the
-performace although this can cause the latency of some I/O to increase due
+processing of request. Therefore, increasing the value can improve the
+performance although this can cause the latency of some I/O to increase due
 to more number of requests.
 
 CFQ Group scheduling
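
For completeness, the tunables documented above are exposed per block device under
the I/O scheduler's sysfs directory. A small shell sketch of adjusting them,
assuming /dev/sda is using CFQ; the values follow the slice_idle=0/group_idle=8
suggestion above and the documented defaults for the latency knobs:

  echo 0   > /sys/block/sda/queue/iosched/slice_idle
  echo 8   > /sys/block/sda/queue/iosched/group_idle
  echo 1   > /sys/block/sda/queue/iosched/low_latency
  echo 300 > /sys/block/sda/queue/iosched/target_latency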
index 09027a9fece5e4b58edd8d45046d39ec46ddf84c..ddf4f93967a94e1385d378ac257f5e919da77cfb 100644 (file)
@@ -480,7 +480,9 @@ memory.stat file includes following statistics
 
 # per-memory cgroup local status
 cache          - # of bytes of page cache memory.
-rss            - # of bytes of anonymous and swap cache memory.
+rss            - # of bytes of anonymous and swap cache memory (includes
+               transparent hugepages).
+rss_huge       - # of bytes of anonymous transparent hugepages.
 mapped_file    - # of bytes of mapped file (includes tmpfs/shmem)
 pgpgin         - # of charging events to the memory cgroup. The charging
                event happens each time a page is accounted as either mapped
index 3c046ee6e8b5dd54e14f7fcc6fb8699792aa4b7d..c80e8a3402f0f5e744d323c0611dec9727f4930e 100644 (file)
@@ -1,14 +1,39 @@
 * Atmel Direct Memory Access Controller (DMA)
 
 Required properties:
-- compatible: Should be "atmel,<chip>-dma"
-- reg: Should contain DMA registers location and length
-- interrupts: Should contain DMA interrupt
+- compatible: Should be "atmel,<chip>-dma".
+- reg: Should contain DMA registers location and length.
+- interrupts: Should contain DMA interrupt.
+- #dma-cells: Must be <2>, used to represent the number of integer cells in
+the dmas property of client devices.
 
-Examples:
+Example:
 
-dma@ffffec00 {
+dma0: dma@ffffec00 {
        compatible = "atmel,at91sam9g45-dma";
        reg = <0xffffec00 0x200>;
        interrupts = <21>;
+       #dma-cells = <2>;
+};
+
+DMA clients connected to the Atmel DMA controller must use the format
+described in the dma.txt file, using a three-cell specifier for each channel:
+a phandle plus two integer cells.
+The three cells in order are:
+
+1. A phandle pointing to the DMA controller.
+2. The memory interface (16 most significant bits), the peripheral interface
+(16 least significant bits).
+3. The peripheral identifier for the hardware handshaking interface. The
+identifier can be different for tx and rx.
+
+Example:
+
+i2c0: i2c@f8010000 {
+       compatible = "atmel,at91sam9x5-i2c";
+       reg = <0xf8010000 0x100>;
+       interrupts = <9 4 6>;
+       dmas = <&dma0 1 7>,
+              <&dma0 1 8>;
+       dma-names = "tx", "rx";
 };
index e7f8d7ed47ebfabb67de32b9b611ed52aca4bf35..6a983c1d87cddfd97b39ec195e176ccf1ca14bb6 100644 (file)
@@ -56,20 +56,20 @@ Example for an AM33xx board:
                        nand-bus-width = <16>;
                        ti,nand-ecc-opt = "bch8";
 
-                       gpmc,sync-clk = <0>;
-                       gpmc,cs-on = <0>;
-                       gpmc,cs-rd-off = <44>;
-                       gpmc,cs-wr-off = <44>;
-                       gpmc,adv-on = <6>;
-                       gpmc,adv-rd-off = <34>;
-                       gpmc,adv-wr-off = <44>;
-                       gpmc,we-off = <40>;
-                       gpmc,oe-off = <54>;
-                       gpmc,access = <64>;
-                       gpmc,rd-cycle = <82>;
-                       gpmc,wr-cycle = <82>;
-                       gpmc,wr-access = <40>;
-                       gpmc,wr-data-mux-bus = <0>;
+                       gpmc,sync-clk-ps = <0>;
+                       gpmc,cs-on-ns = <0>;
+                       gpmc,cs-rd-off-ns = <44>;
+                       gpmc,cs-wr-off-ns = <44>;
+                       gpmc,adv-on-ns = <6>;
+                       gpmc,adv-rd-off-ns = <34>;
+                       gpmc,adv-wr-off-ns = <44>;
+                       gpmc,we-off-ns = <40>;
+                       gpmc,oe-off-ns = <54>;
+                       gpmc,access-ns = <64>;
+                       gpmc,rd-cycle-ns = <82>;
+                       gpmc,wr-cycle-ns = <82>;
+                       gpmc,wr-access-ns = <40>;
+                       gpmc,wr-data-mux-bus-ns = <0>;
 
                        #address-cells = <1>;
                        #size-cells = <1>;
index 6e1f61f1e789699082d7eef9d893a03280f2c039..9315ac96b49b224665b674f1d0109805aaf0c9a8 100644 (file)
@@ -5,8 +5,12 @@ on platforms which have strong conventions about which portions of a flash are
 used for what purposes, but which don't use an on-flash partition table such
 as RedBoot.
 
-#address-cells & #size-cells must both be present in the mtd device and be
-equal to 1.
+#address-cells & #size-cells must both be present in the mtd device. There are
+two valid values for both:
+<1>: for partitions that require a single 32-bit cell to represent their
+     size/address (aka the value is below 4 GiB)
+<2>: for partitions that require two 32-bit cells to represent their
+     size/address (aka the value is 4 GiB or greater).
 
 Required properties:
 - reg : The partition's offset and size within the mtd bank.
@@ -36,3 +40,31 @@ flash@0 {
                reg = <0x0100000 0x200000>;
        };
 };
+
+flash@1 {
+       #address-cells = <1>;
+       #size-cells = <2>;
+
+       /* a 4 GiB partition */
+       partition@0 {
+               label = "filesystem";
+               reg = <0x00000000 0x1 0x00000000>;
+       };
+};
+
+flash@2 {
+       #address-cells = <2>;
+       #size-cells = <2>;
+
+       /* an 8 GiB partition */
+       partition@0 {
+               label = "filesystem #1";
+               reg = <0x0 0x00000000 0x2 0x00000000>;
+       };
+
+       /* a 4 GiB partition */
+       partition@200000000 {
+               label = "filesystem #2";
+               reg = <0x2 0x00000000 0x1 0x00000000>;
+       };
+};
index 24cb4e46f67504d2f655eae618202614aa2b42cc..ace4a64b3695930254570d742704eb2abc888c37 100644 (file)
@@ -26,16 +26,16 @@ Required properties:
 - bank-width:          Address width of the device in bytes. GPMC supports 8-bit
                        and 16-bit devices and so must be either 1 or 2 bytes.
 - compatible:          Compatible string property for the ethernet child device.
-- gpmc,cs-on:          Chip-select assertion time
-- gpmc,cs-rd-off:      Chip-select de-assertion time for reads
-- gpmc,cs-wr-off:      Chip-select de-assertion time for writes
-- gpmc,oe-on:          Output-enable assertion time
-- gpmc,oe-off          Output-enable de-assertion time
-- gpmc,we-on:          Write-enable assertion time
-- gpmc,we-off:         Write-enable de-assertion time
-- gpmc,access:         Start cycle to first data capture (read access)
-- gpmc,rd-cycle:       Total read cycle time
-- gpmc,wr-cycle:       Total write cycle time
+- gpmc,cs-on-ns:       Chip-select assertion time
+- gpmc,cs-rd-off-ns:   Chip-select de-assertion time for reads
+- gpmc,cs-wr-off-ns:   Chip-select de-assertion time for writes
+- gpmc,oe-on-ns:       Output-enable assertion time
+- gpmc,oe-off-ns:      Output-enable de-assertion time
+- gpmc,we-on-ns:       Write-enable assertion time
+- gpmc,we-off-ns:      Write-enable de-assertion time
+- gpmc,access-ns:      Start cycle to first data capture (read access)
+- gpmc,rd-cycle-ns:    Total read cycle time
+- gpmc,wr-cycle-ns:    Total write cycle time
 - reg:                 Chip-select, base address (relative to chip-select)
                        and size of the memory mapped for the device.
                        Note that base address will be typically 0 as this
@@ -65,24 +65,24 @@ gpmc: gpmc@6e000000 {
                bank-width = <2>;
 
                gpmc,mux-add-data;
-               gpmc,cs-on = <0>;
-               gpmc,cs-rd-off = <186>;
-               gpmc,cs-wr-off = <186>;
-               gpmc,adv-on = <12>;
-               gpmc,adv-rd-off = <48>;
-               gpmc,adv-wr-off = <48>;
-               gpmc,oe-on = <54>;
-               gpmc,oe-off = <168>;
-               gpmc,we-on = <54>;
-               gpmc,we-off = <168>;
-               gpmc,rd-cycle = <186>;
-               gpmc,wr-cycle = <186>;
-               gpmc,access = <114>;
-               gpmc,page-burst-access = <6>;
-               gpmc,bus-turnaround = <12>;
-               gpmc,cycle2cycle-delay = <18>;
-               gpmc,wr-data-mux-bus = <90>;
-               gpmc,wr-access = <186>;
+               gpmc,cs-on-ns = <0>;
+               gpmc,cs-rd-off-ns = <186>;
+               gpmc,cs-wr-off-ns = <186>;
+               gpmc,adv-on-ns = <12>;
+               gpmc,adv-rd-off-ns = <48>;
+               gpmc,adv-wr-off-ns = <48>;
+               gpmc,oe-on-ns = <54>;
+               gpmc,oe-off-ns = <168>;
+               gpmc,we-on-ns = <54>;
+               gpmc,we-off-ns = <168>;
+               gpmc,rd-cycle-ns = <186>;
+               gpmc,wr-cycle-ns = <186>;
+               gpmc,access-ns = <114>;
+               gpmc,page-burst-access-ns = <6>;
+               gpmc,bus-turnaround-ns = <12>;
+               gpmc,cycle2cycle-delay-ns = <18>;
+               gpmc,wr-data-mux-bus-ns = <90>;
+               gpmc,wr-access-ns = <186>;
                gpmc,cycle2cycle-samecsen;
                gpmc,cycle2cycle-diffcsen;
 
diff --git a/Documentation/devicetree/bindings/thermal/armada-thermal.txt b/Documentation/devicetree/bindings/thermal/armada-thermal.txt
new file mode 100644 (file)
index 0000000..fff93d5
--- /dev/null
@@ -0,0 +1,22 @@
+* Marvell Armada 370/XP thermal management
+
+Required properties:
+
+- compatible:  Should be set to one of the following:
+               marvell,armada370-thermal
+               marvell,armadaxp-thermal
+
+- reg:         Device's register space.
+               Two entries are expected, see the examples below.
+               The first one is required for the sensor register;
+               the second one is required for the control register
+               to be used for sensor initialization (a.k.a. calibration).
+
+Example:
+
+       thermal@d0018300 {
+               compatible = "marvell,armada370-thermal";
+                reg = <0xd0018300 0x4
+                      0xd0018304 0x4>;
+               status = "okay";
+       };
diff --git a/Documentation/dmatest.txt b/Documentation/dmatest.txt
new file mode 100644 (file)
index 0000000..279ac0a
--- /dev/null
@@ -0,0 +1,81 @@
+                               DMA Test Guide
+                               ==============
+
+               Andy Shevchenko <andriy.shevchenko@linux.intel.com>
+
+This small document introduces how to test DMA drivers using the dmatest module.
+
+       Part 1 - How to build the test module
+
+The menuconfig contains an option that can be found at the following path:
+       Device Drivers -> DMA Engine support -> DMA Test client
+
+In the configuration file the option is called CONFIG_DMATEST. The dmatest can
+be built either as a module or into the kernel. Let's consider both cases.
+
+       Part 2 - When dmatest is built as a module...
+
+After mounting debugfs and loading the module, the /sys/kernel/debug/dmatest
+folder with nodes will be created. The nodes mirror the module parameters, with
+the addition of a 'run' node that controls the run and stop phases of the test.
+
+Note that in this case the test will not run automatically on load.
+
+Example of usage:
+       % echo dma0chan0 > /sys/kernel/debug/dmatest/channel
+       % echo 2000 > /sys/kernel/debug/dmatest/timeout
+       % echo 1 > /sys/kernel/debug/dmatest/iterations
+       % echo 1 > /sys/kernel/debug/dmatest/run
+
+Hint: the list of available channels can be obtained by running the following
+command:
+       % ls -1 /sys/class/dma/
+
+After a while you will start to get messages about the current status or errors,
+in the same form as the original code reported them.
+
+Note that running a new test will stop any test currently in progress.
+
+The following command returns the current state of the test.
+       % cat /sys/kernel/debug/dmatest/run
+
+To wait for test completion, the user may perform a busy loop that checks the state.
+
+       % while [ $(cat /sys/kernel/debug/dmatest/run) = "Y" ]
+       > do
+       >       echo -n "."
+       >       sleep 1
+       > done
+       > echo
+
+       Part 3 - When dmatest is built into the kernel...
+
+The module parameters supplied on the kernel command line will be used for the
+first test performed. After the user gets control, the test can be interrupted
+or re-run with the same or different parameters. For the details see the above
+section "Part 2 - When dmatest is built as a module..."
+
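For illustration, a hedged sketch of such a kernel command line (the channel name
and values are examples only; the parameter names mirror the debugfs nodes
described in Part 2):

	dmatest.channel=dma0chan0 dmatest.iterations=1 dmatest.timeout=2000
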
+In both cases the module parameters are used as initial values for the test case.
+You can always check them at run-time by running
+       % grep -H . /sys/module/dmatest/parameters/*
+
+       Part 4 - Gathering the test results
+
+The module stores the test results in memory. The gathered data can be used
+after the test is done.
+
+The special file 'results' in debugfs exposes the data gathered from the test in
+progress. The messages collected are printed to the kernel log as well.
+
+Example of output:
+       % cat /sys/kernel/debug/dmatest/results
+       dma0chan0-copy0: #1: No errors with src_off=0x7bf dst_off=0x8ad len=0x3fea (0)
+
+The message format is unified across the different types of errors. A number in
+parentheses represents additional information, e.g. error code, error counter,
+or status.
+
+The result of the buffer comparison is stored in a dedicated structure.
+
+Note that the verification result is now accessible only via the 'results' file
+in debugfs.
index dcf338e62b71108a3a02e0198edf0eb564930121..bd3c56c67380b1cf27d3ef82f59747e76a2ad07f 100644 (file)
@@ -146,7 +146,7 @@ USAGE
 
 Format options
 --------------
--l [label]   : Give a volume label, up to 256 unicode name.
+-l [label]   : Give a volume label, up to 512 unicode characters.
 -a [0 or 1]  : Split start location of each area for heap-based allocation.
                1 is set by default, which performs this.
 -o [int]     : Set overprovision ratio in percent over volume size.
@@ -156,6 +156,8 @@ Format options
 -z [int]     : Set the number of sections per zone.
                1 is set by default.
 -e [str]     : Set basic extension list. e.g. "mp3,gif,mov"
+-t [0 or 1]  : Disable the discard command or not.
+               1 is set by default, which performs discard.
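A hedged usage sketch combining these options (the device path, label and values
are illustrative only; -t 0 turns discard off, while the default of 1 keeps it
enabled):

	# mkfs.f2fs -l f2fs_vol -o 5 -t 0 /dev/sdX1
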
 
 ================================================================================
 DESIGN
index 77a1d11af723282625dd3c3f326a5b6576fb437c..6f83fa965b4b6e6975fda4459c72f9711ea73159 100644 (file)
@@ -72,11 +72,11 @@ in this document, but drivers acting as clients to the GPIO interface must
 not care how it's implemented.)
 
 That said, if the convention is supported on their platform, drivers should
-use it when possible.  Platforms must declare GENERIC_GPIO support in their
-Kconfig (boolean true), and provide an <asm/gpio.h> file.  Drivers that can't
-work without standard GPIO calls should have Kconfig entries which depend
-on GENERIC_GPIO.  The GPIO calls are available, either as "real code" or as
-optimized-away stubs, when drivers use the include file:
+use it when possible.  Platforms must select ARCH_REQUIRE_GPIOLIB or
+ARCH_WANT_OPTIONAL_GPIOLIB in their Kconfig.  Drivers that can't work without
+standard GPIO calls should have Kconfig entries which depend on GPIOLIB.  The
+GPIO calls are available, either as "real code" or as optimized-away stubs,
+when drivers use the include file:
 
        #include <linux/gpio.h>
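
As a hedged sketch of the convention above (the symbol names are hypothetical):
a driver that cannot work without the standard GPIO calls declares the
dependency in its Kconfig entry, and a platform providing gpiolib selects the
corresponding option:

	config FOO_GPIO_CONSUMER
		tristate "Hypothetical driver using the GPIO calls"
		depends on GPIOLIB

	config ARCH_FOO
		bool "Hypothetical platform"
		select ARCH_REQUIRE_GPIOLIB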
 
index b73bbfb697bb233a92d409c53651e0f80acbeb9b..36a3e79c120354bb5c19d57919b814809a4e5026 100644 (file)
@@ -13,11 +13,11 @@ Thermal emulation mode supports software debug for TMU's operation. User can set
 manually with software code and TMU will read current temperature from user value not from
 sensor's value.
 
-Enabling CONFIG_EXYNOS_THERMAL_EMUL option will make this support in available.
-When it's enabled, sysfs node will be created under
-/sys/bus/platform/devices/'exynos device name'/ with name of 'emulation'.
+Enabling the CONFIG_THERMAL_EMULATION option will make this support available.
+When it's enabled, a sysfs node will be created as
+/sys/devices/virtual/thermal/thermal_zone'zone id'/emul_temp.
 
-The sysfs node, 'emulation', will contain value 0 for the initial state. When you input any
+The sysfs node, 'emul_temp', will contain value 0 for the initial state. When you input any
 temperature you want to update to sysfs node, it automatically enable emulation mode and
 current temperature will be changed into it.
 (Exynos also supports user changable delay time which would be used to delay of
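A hedged usage sketch of the emul_temp node described above (the zone id and
value are illustrative; the value is in millidegrees Celsius per sysfs-api.txt,
and writing 0 is expected to end emulation):

	# echo 55000 > /sys/devices/virtual/thermal/thermal_zone0/emul_temp
	# cat /sys/devices/virtual/thermal/thermal_zone0/temp
	# echo 0 > /sys/devices/virtual/thermal/thermal_zone0/emul_temp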
index 6859661c9d31be090a78e5a2c35a9a5fdad7ca80..a71bd5b90fe89ad68cc01d93e655bc7b79d69467 100644 (file)
@@ -31,15 +31,17 @@ temperature) and throttle appropriate devices.
 1. thermal sysfs driver interface functions
 
 1.1 thermal zone device interface
-1.1.1 struct thermal_zone_device *thermal_zone_device_register(char *name,
+1.1.1 struct thermal_zone_device *thermal_zone_device_register(char *type,
                int trips, int mask, void *devdata,
-               struct thermal_zone_device_ops *ops)
+               struct thermal_zone_device_ops *ops,
+               const struct thermal_zone_params *tzp,
+               int passive_delay, int polling_delay)
 
     This interface function adds a new thermal zone device (sensor) to
     /sys/class/thermal folder as thermal_zone[0-*]. It tries to bind all the
     thermal cooling devices registered at the same time.
 
-    name: the thermal zone name.
+    type: the thermal zone type.
     trips: the total number of trip points this thermal zone supports.
     mask: Bit string: If 'n'th bit is set, then trip point 'n' is writeable.
     devdata: device private data
@@ -57,6 +59,12 @@ temperature) and throttle appropriate devices.
                        will be fired.
        .set_emul_temp: set the emulation temperature which helps in debugging
                        different threshold temperature points.
+    tzp: thermal zone platform parameters.
+    passive_delay: number of milliseconds to wait between polls when
+       performing passive cooling.
+    polling_delay: number of milliseconds to wait between polls when checking
+       whether trip points have been crossed (0 for interrupt driven systems).
+
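As a hedged sketch of a sensor driver calling this interface (names such as
foo_get_temp are hypothetical, and the exact callback signatures vary between
kernel versions); it registers a zone with no trip points that is polled every
second:

#include <linux/err.h>
#include <linux/thermal.h>

/* Hypothetical callback: report the current temperature in millidegrees C. */
static int foo_get_temp(struct thermal_zone_device *tz, unsigned long *temp)
{
	*temp = 45000;		/* a real driver would read the hardware here */
	return 0;
}

static struct thermal_zone_device_ops foo_ops = {
	.get_temp = foo_get_temp,
};

static struct thermal_zone_device *foo_tz;

static int foo_thermal_init(void *sensor_data)
{
	/* type "foo_thermal", 0 trips, mask 0, no platform params (tzp),
	   passive_delay 0 ms, polling_delay 1000 ms */
	foo_tz = thermal_zone_device_register("foo_thermal", 0, 0, sensor_data,
					      &foo_ops, NULL, 0, 1000);
	if (IS_ERR(foo_tz))
		return PTR_ERR(foo_tz);
	return 0;
}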
 
 1.1.2 void thermal_zone_device_unregister(struct thermal_zone_device *tz)
 
@@ -265,6 +273,10 @@ emul_temp
        Unit: millidegree Celsius
        WO, Optional
 
+         WARNING: Be careful while enabling this option on production systems,
+         because userland can easily disable the thermal policy by simply
+         flooding this sysfs node with low temperature values.
+
 *****************************
 * Cooling device attributes *
 *****************************
@@ -363,7 +375,7 @@ This function returns the thermal_instance corresponding to a given
 {thermal_zone, cooling_device, trip_point} combination. Returns NULL
 if such an instance does not exist.
 
-5.3:notify_thermal_framework:
+5.3:thermal_notify_framework:
 This function handles the trip events from sensor drivers. It starts
 throttling the cooling devices according to the policy configured.
 For CRITICAL and HOT trip points, this notifies the respective drivers,
@@ -375,11 +387,3 @@ platform data is provided, this uses the step_wise throttling policy.
 This function serves as an arbitrator to set the state of a cooling
 device. It sets the cooling device to the deepest cooling state if
 possible.
-
-5.5:thermal_register_governor:
-This function lets the various thermal governors to register themselves
-with the Thermal framework. At run time, depending on a zone's platform
-data, a particular governor is used for throttling.
-
-5.6:thermal_unregister_governor:
-This function unregisters a governor from the thermal framework.
index 4fa7b4e6f856ab163f7403b8c0a3aceb57ce9764..d5b8f01833f41365689bc78258770eecebd46f36 100644 (file)
@@ -84,10 +84,10 @@ GPIO 公约
 控制器的抽象函数来实现它。(有一些可选的代码能支持这种策略的实现,本文档
 后面会介绍,但作为 GPIO 接口的客户端驱动程序必须与它的实现无关。)
 
-也就是说,如果在他们的平台上支持这个公约,驱动应尽可能的使用它。平台
-必须在 Kconfig 中声明对 GENERIC_GPIO的支持 (布尔型 true),并提供
-一个 <asm/gpio.h> 文件。那些调用标准 GPIO 函数的驱动应该在 Kconfig
-入口中声明依赖GENERIC_GPIO。当驱动包含文件:
+也就是说,如果在他们的平台上支持这个公约,驱动应尽可能的使用它。同时,平台
+必须在 Kconfig 中选择 ARCH_REQUIRE_GPIOLIB 或者 ARCH_WANT_OPTIONAL_GPIOLIB
+选项。那些调用标准 GPIO 函数的驱动应该在 Kconfig 入口中声明依赖GENERIC_GPIO。
+当驱动包含文件:
 
        #include <linux/gpio.h>
 
index e1f5fac1838efde0cb54741d97bdab8ac1469dda..3d7782b9f90d80d171c97e1d15371a9e2ec51ddc 100644 (file)
@@ -1620,6 +1620,13 @@ W:       http://www.baycom.org/~tom/ham/ham.html
 S:     Maintained
 F:     drivers/net/hamradio/baycom*
 
+BCACHE (BLOCK LAYER CACHE)
+M:     Kent Overstreet <koverstreet@google.com>
+L:     linux-bcache@vger.kernel.org
+W:     http://bcache.evilpiepirate.org
+S:     Maintained
+F:     drivers/md/bcache/
+
 BEFS FILE SYSTEM
 S:     Orphan
 F:     Documentation/filesystems/befs.txt
@@ -6716,6 +6723,14 @@ F:       drivers/remoteproc/
 F:     Documentation/remoteproc.txt
 F:     include/linux/remoteproc.h
 
+REMOTE PROCESSOR MESSAGING (RPMSG) SUBSYSTEM
+M:     Ohad Ben-Cohen <ohad@wizery.com>
+T:     git git://git.kernel.org/pub/scm/linux/kernel/git/ohad/rpmsg.git
+S:     Maintained
+F:     drivers/rpmsg/
+F:     Documentation/rpmsg.txt
+F:     include/linux/rpmsg.h
+
 RFKILL
 M:     Johannes Berg <johannes@sipsolutions.net>
 L:     linux-wireless@vger.kernel.org
@@ -7140,9 +7155,9 @@ F:        drivers/misc/phantom.c
 F:     include/uapi/linux/phantom.h
 
 SERIAL ATA (SATA) SUBSYSTEM
-M:     Jeff Garzik <jgarzik@pobox.com>
+M:     Tejun Heo <tj@kernel.org>
 L:     linux-ide@vger.kernel.org
-T:     git git://git.kernel.org/pub/scm/linux/kernel/git/jgarzik/libata-dev.git
+T:     git git://git.kernel.org/pub/scm/linux/kernel/git/tj/libata.git
 S:     Supported
 F:     drivers/ata/
 F:     include/linux/ata.h
@@ -8014,11 +8029,14 @@ F:      arch/xtensa/
 
 THERMAL
 M:      Zhang Rui <rui.zhang@intel.com>
+M:      Eduardo Valentin <eduardo.valentin@ti.com>
 L:      linux-pm@vger.kernel.org
 T:      git git://git.kernel.org/pub/scm/linux/kernel/git/rzhang/linux.git
+Q:      https://patchwork.kernel.org/project/linux-pm/list/
 S:      Supported
 F:      drivers/thermal/
 F:      include/linux/thermal.h
+F:      include/linux/cpu_cooling.h
 
 THINGM BLINK(1) USB RGB LED DRIVER
 M:     Vivien Didelot <vivien.didelot@savoirfairelinux.com>
index 8629127640cff0779bdaebd87d4fadd32799e04e..837a1f2d8b965a60b61252521f21129be9e80afe 100644 (file)
@@ -55,9 +55,6 @@ config GENERIC_CALIBRATE_DELAY
        bool
        default y
 
-config GENERIC_GPIO
-       bool
-
 config ZONE_DMA
        bool
        default y
index 34ef016626ff401211964897e5c75ec9f012583b..d423d58f938dc40fb5b3c01445b9184572631b10 100644 (file)
@@ -109,9 +109,6 @@ config MIGHT_HAVE_PCI
 config SYS_SUPPORTS_APM_EMULATION
        bool
 
-config GENERIC_GPIO
-       bool
-
 config HAVE_TCM
        bool
        select GENERIC_ALLOCATOR
@@ -900,7 +897,6 @@ config ARCH_MULTI_V7
        bool "ARMv7 based platforms (Cortex-A, PJ4, Scorpion, Krait)"
        default y
        select ARCH_MULTI_V6_V7
-       select ARCH_VEXPRESS
        select CPU_V7
 
 config ARCH_MULTI_V6_V7
@@ -1794,6 +1790,7 @@ config XEN
        depends on ARM && AEABI && OF
        depends on CPU_V7 && !CPU_V6
        depends on !GENERIC_ATOMIC64
+       select ARM_PSCI
        help
          Say Y if you want to run Linux in a Virtual Machine on Xen on ARM.
 
index 8562af4fe8fd46e4639c275120ce09129efac733..b9f7121e6ecf02c561e5b1b10308659aad23fb28 100644 (file)
@@ -200,8 +200,8 @@ dtb-$(CONFIG_ARCH_VERSATILE) += versatile-ab.dtb \
 dtb-$(CONFIG_ARCH_VEXPRESS) += vexpress-v2p-ca5s.dtb \
        vexpress-v2p-ca9.dtb \
        vexpress-v2p-ca15-tc1.dtb \
-       vexpress-v2p-ca15_a7.dtb \
-       xenvm-4.2.dtb
+       vexpress-v2p-ca15_a7.dtb
+dtb-$(CONFIG_ARCH_VIRT) += xenvm-4.2.dtb
 dtb-$(CONFIG_ARCH_VT8500) += vt8500-bv07.dtb \
        wm8505-ref.dtb \
        wm8650-mid.dtb \
index 0a61bbb9102fa712fee3c4b04d96efd240f99a86..3f0239ec1bc5907c4cd2ccfc555e01fe91ff9582 100644 (file)
        i2c@12C70000 {
                samsung,i2c-sda-delay = <100>;
                samsung,i2c-max-bus-freq = <378000>;
+
+               trackpad {
+                       reg = <0x67>;
+                       compatible = "cypress,cyapa";
+                       interrupts = <2 0>;
+                       interrupt-parent = <&gpx1>;
+                       wakeup-source;
+               };
        };
 
        i2c@12C80000 {
index ec3f9528e180c75e42b633104fb537ced0762691..336915151398d81a44d8f1521cf844ade6961237 100644 (file)
                        compatible = "arm,cortex-a15";
                        reg = <0>;
                };
+
+               cpu@1 {
+                       device_type = "cpu";
+                       compatible = "arm,cortex-a15";
+                       reg = <1>;
+               };
+       };
+
+       psci {
+               compatible      = "arm,psci";
+               method          = "hvc";
+               cpu_off         = <1>;
+               cpu_on          = <2>;
        };
 
        memory@80000000 {
index 33903ca0d8798fa6c2449b7d357397c6285c3288..c1ef64bc5abd65781da53de52d4cfd3e392e29cc 100644 (file)
@@ -137,6 +137,8 @@ CONFIG_SERIAL_8250_DETECT_IRQ=y
 CONFIG_SERIAL_8250_RSA=y
 CONFIG_SERIAL_AMBA_PL011=y
 CONFIG_SERIAL_AMBA_PL011_CONSOLE=y
+CONFIG_SERIAL_OMAP=y
+CONFIG_SERIAL_OMAP_CONSOLE=y
 CONFIG_HW_RANDOM=y
 CONFIG_I2C_CHARDEV=y
 CONFIG_SPI=y
@@ -153,6 +155,7 @@ CONFIG_OMAP_WATCHDOG=y
 CONFIG_TWL4030_WATCHDOG=y
 CONFIG_MFD_TPS65217=y
 CONFIG_MFD_TPS65910=y
+CONFIG_TWL6040_CORE=y
 CONFIG_REGULATOR_TWL4030=y
 CONFIG_REGULATOR_TPS65023=y
 CONFIG_REGULATOR_TPS6507X=y
@@ -195,6 +198,7 @@ CONFIG_SND_USB_AUDIO=m
 CONFIG_SND_SOC=m
 CONFIG_SND_OMAP_SOC=m
 CONFIG_SND_OMAP_SOC_OMAP_TWL4030=m
+CONFIG_SND_OMAP_SOC_OMAP_ABE_TWL6040=m
 CONFIG_SND_OMAP_SOC_OMAP3_PANDORA=m
 CONFIG_USB=y
 CONFIG_USB_DEBUG=y
index 8a823253d775d83337bab6e833d30949e553e49d..799f42ecca63309d485090add41c495cc4f4ccea 100644 (file)
@@ -46,6 +46,7 @@ int HYPERVISOR_event_channel_op(int cmd, void *arg);
 unsigned long HYPERVISOR_hvm_op(int op, void *arg);
 int HYPERVISOR_memory_op(unsigned int cmd, void *arg);
 int HYPERVISOR_physdev_op(int cmd, void *arg);
+int HYPERVISOR_vcpu_op(int cmd, int vcpuid, void *extra_args);
 
 static inline void
 MULTI_update_va_mapping(struct multicall_entry *mcl, unsigned long va,
index 70f1bdeb241b1d4683d15025d3c7243cd69683d5..5af04f6daa33804ac8a3d03b7765e7bae4a86106 100644 (file)
@@ -180,6 +180,13 @@ struct machine_desc * __init setup_machine_fdt(unsigned int dt_phys)
        unsigned long dt_root;
        const char *model;
 
+#ifdef CONFIG_ARCH_MULTIPLATFORM
+       DT_MACHINE_START(GENERIC_DT, "Generic DT based system")
+       MACHINE_END
+
+       mdesc_best = (struct machine_desc *)&__mach_desc_GENERIC_DT;
+#endif
+
        if (!dt_phys)
                return NULL;
 
index 728007c4a2b7eaf18e038d93a8075699d7a479dc..1522c7ae31b0c239901237569bad5cbb4ec7b02e 100644 (file)
@@ -18,6 +18,7 @@
 #include <linux/bootmem.h>
 #include <linux/seq_file.h>
 #include <linux/screen_info.h>
+#include <linux/of_platform.h>
 #include <linux/init.h>
 #include <linux/kexec.h>
 #include <linux/of_fdt.h>
@@ -659,9 +660,19 @@ struct screen_info screen_info = {
 
 static int __init customize_machine(void)
 {
-       /* customizes platform devices, or adds new ones */
+       /*
+        * customizes platform devices, or adds new ones
+        * On DT based machines, we fall back to populating the
+        * machine from the device tree, if no callback is provided,
+        * otherwise we would always need an init_machine callback.
+        */
        if (machine_desc->init_machine)
                machine_desc->init_machine();
+#ifdef CONFIG_OF
+       else
+               of_platform_populate(NULL, of_default_bus_match_table,
+                                       NULL, NULL);
+#endif
        return 0;
 }
 arch_initcall(customize_machine);
index 78f795d73cb64eeae161bda1ea5917aa3bf8fe3d..ba44328464f37c0cdce9eb9039fcfce8ad8dc743 100644 (file)
@@ -5,6 +5,7 @@ config ARCH_MXC
        select AUTO_ZRELADDR if !ZBOOT_ROM
        select CLKDEV_LOOKUP
        select CLKSRC_MMIO
+       select GENERIC_ALLOCATOR
        select GENERIC_CLOCKEVENTS
        select GENERIC_IRQ_CHIP
        select MULTI_IRQ_HANDLER
@@ -61,10 +62,6 @@ config MXC_ULPI
 config ARCH_HAS_RNGA
        bool
 
-config IRAM_ALLOC
-       bool
-       select GENERIC_ALLOCATOR
-
 config HAVE_IMX_ANATOP
        bool
 
index 930958973f81f14fce5ca270bcb3667b5f660ffa..70ae7c490ac0428ce637e7921a26203ede04e1ac 100644 (file)
@@ -23,7 +23,6 @@ obj-$(CONFIG_ARCH_MXC_IOMUX_V3) += iomux-v3.o
 obj-$(CONFIG_MXC_TZIC) += tzic.o
 obj-$(CONFIG_MXC_AVIC) += avic.o
 
-obj-$(CONFIG_IRAM_ALLOC) += iram_alloc.o
 obj-$(CONFIG_MXC_ULPI) += ulpi.o
 obj-$(CONFIG_MXC_USE_EPIT) += epit.o
 obj-$(CONFIG_MXC_DEBUG_BOARD) += 3ds_debugboard.o
index 4cba7dbb079fce445fca9afbd8d9f18ce11b56ae..c08ae3f99cee0e98f0025a0840a430407406e1b1 100644 (file)
@@ -12,6 +12,7 @@
 #define __ASM_ARCH_MXC_COMMON_H__
 
 struct platform_device;
+struct pt_regs;
 struct clk;
 enum mxc_cpu_pwr_mode;
 
index a58c8b0527ccb3aad1f542b83a159c868a61655d..67b9c48dcafe7dc1ee281c49ece9de88338297f4 100644 (file)
@@ -24,7 +24,7 @@ ENTRY(v7_secondary_startup)
 ENDPROC(v7_secondary_startup)
 #endif
 
-#ifdef CONFIG_PM
+#ifdef CONFIG_ARM_CPU_SUSPEND
 /*
  * The following code must assume it is running from physical address
  * where absolute virtual addresses to the data section have to be
index 5e91112dcbee8eddcb32ce228f82e3e8f6ab6896..3daf1ed90579a74a97f4d25f4292d8d09e50a1f5 100644 (file)
@@ -11,7 +11,9 @@
  */
 
 #include <linux/errno.h>
+#include <linux/jiffies.h>
 #include <asm/cp15.h>
+#include <asm/proc-fns.h>
 
 #include "common.h"
 
diff --git a/arch/arm/mach-imx/iram_alloc.c b/arch/arm/mach-imx/iram_alloc.c
deleted file mode 100644 (file)
index e05cf40..0000000
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
- * Copyright (C) 2010 Freescale Semiconductor, Inc. All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version 2
- * of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
- * MA 02110-1301, USA.
- */
-
-#include <linux/kernel.h>
-#include <linux/io.h>
-#include <linux/module.h>
-#include <linux/spinlock.h>
-#include <linux/genalloc.h>
-#include "linux/platform_data/imx-iram.h"
-
-static unsigned long iram_phys_base;
-static void __iomem *iram_virt_base;
-static struct gen_pool *iram_pool;
-
-static inline void __iomem *iram_phys_to_virt(unsigned long p)
-{
-       return iram_virt_base + (p - iram_phys_base);
-}
-
-void __iomem *iram_alloc(unsigned int size, unsigned long *dma_addr)
-{
-       if (!iram_pool)
-               return NULL;
-
-       *dma_addr = gen_pool_alloc(iram_pool, size);
-       pr_debug("iram alloc - %dB@0x%lX\n", size, *dma_addr);
-       if (!*dma_addr)
-               return NULL;
-       return iram_phys_to_virt(*dma_addr);
-}
-EXPORT_SYMBOL(iram_alloc);
-
-void iram_free(unsigned long addr, unsigned int size)
-{
-       if (!iram_pool)
-               return;
-
-       gen_pool_free(iram_pool, addr, size);
-}
-EXPORT_SYMBOL(iram_free);
-
-int __init iram_init(unsigned long base, unsigned long size)
-{
-       iram_phys_base = base;
-
-       iram_pool = gen_pool_create(PAGE_SHIFT, -1);
-       if (!iram_pool)
-               return -ENOMEM;
-
-       gen_pool_add(iram_pool, base, size, -1);
-       iram_virt_base = ioremap(iram_phys_base, size);
-       if (!iram_virt_base)
-               return -EIO;
-
-       pr_debug("i.MX IRAM pool: %ld KB@0x%p\n", size / 1024, iram_virt_base);
-       return 0;
-}
index 7777767ee89ae0aecf390cf4396574e76b02517b..9c392a29fc7e89a2802aedbaf1aca81e0ff846e6 100644 (file)
@@ -66,6 +66,6 @@ void msm_init_last_radio_log(struct module *owner)
        pr_err("%s: last radio log is %d bytes long\n", __func__,
                radio_log_size);
        last_radio_log_fops.owner = owner;
-       entry->size = radio_log_size;
+       proc_set_size(entry, radio_log_size);
 }
 EXPORT_SYMBOL(msm_init_last_radio_log);
index 1a4e887f028d59b9e4c84584a5fc5418a4693e41..68ab858e27b754bea93264889ab9b020cf67d698 100644 (file)
@@ -301,7 +301,7 @@ static int __init omap1_system_dma_init(void)
        if (ret) {
                dev_err(&pdev->dev, "%s: Unable to add resources for %s%d\n",
                        __func__, pdev->name, pdev->id);
-               goto exit_device_put;
+               goto exit_iounmap;
        }
 
        p = kzalloc(sizeof(struct omap_system_dma_plat_info), GFP_KERNEL);
@@ -309,7 +309,7 @@ static int __init omap1_system_dma_init(void)
                dev_err(&pdev->dev, "%s: Unable to allocate 'p' for %s\n",
                        __func__, pdev->name);
                ret = -ENOMEM;
-               goto exit_device_del;
+               goto exit_iounmap;
        }
 
        d = kzalloc(sizeof(struct omap_dma_dev_attr), GFP_KERNEL);
@@ -402,8 +402,8 @@ exit_release_d:
        kfree(d);
 exit_release_p:
        kfree(p);
-exit_device_del:
-       platform_device_del(pdev);
+exit_iounmap:
+       iounmap(dma_base);
 exit_device_put:
        platform_device_put(pdev);
 
index 857b1f097fd80ac6c6e59f9406630bbea2a59174..f49cd51e162afcc6055d9143ced540d8fa888ab5 100644 (file)
@@ -37,8 +37,6 @@ config ARCH_OMAP2PLUS_TYPICAL
        select NEON if ARCH_OMAP3 || ARCH_OMAP4 || SOC_OMAP5
        select PM_RUNTIME
        select REGULATOR
-       select SERIAL_OMAP
-       select SERIAL_OMAP_CONSOLE
        select TWL4030_CORE if ARCH_OMAP3 || ARCH_OMAP4
        select TWL4030_POWER if ARCH_OMAP3 || ARCH_OMAP4
        select VFP
index 62bb352c2d37fd80b2793aa3d8d744ff9eec1195..55a9d677768328d552cec82e6229402a5696a8b2 100644 (file)
@@ -32,12 +32,12 @@ obj-$(CONFIG_SOC_HAS_OMAP2_SDRC)    += sdrc.o
 
 # SMP support ONLY available for OMAP4
 
-obj-$(CONFIG_SMP)                      += omap-smp.o omap-headsmp.o
-obj-$(CONFIG_HOTPLUG_CPU)              += omap-hotplug.o
+smp-$(CONFIG_SMP)                      += omap-smp.o omap-headsmp.o
+smp-$(CONFIG_HOTPLUG_CPU)              += omap-hotplug.o
 omap-4-5-common                                =  omap4-common.o omap-wakeupgen.o \
                                           sleep44xx.o
-obj-$(CONFIG_ARCH_OMAP4)               += $(omap-4-5-common)
-obj-$(CONFIG_SOC_OMAP5)                        += $(omap-4-5-common)
+obj-$(CONFIG_ARCH_OMAP4)               += $(omap-4-5-common) $(smp-y)
+obj-$(CONFIG_SOC_OMAP5)                        += $(omap-4-5-common) $(smp-y)
 
 plus_sec := $(call as-instr,.arch_extension sec,+sec)
 AFLAGS_omap-headsmp.o                  :=-Wa,-march=armv7-a$(plus_sec)
index 6de78605c0afa75b87b5ca52abcd83e04d0ec1ce..04c1165554125553b29fc1292404c1cc922ed965 100644 (file)
@@ -112,13 +112,13 @@ static u8 omap3_beagle_version;
  */
 static struct {
        int mmc1_gpio_wp;
-       int usb_pwr_level;
+       bool usb_pwr_level;     /* 0 - Active Low, 1 - Active High */
        int dvi_pd_gpio;
        int usr_button_gpio;
        int mmc_caps;
 } beagle_config = {
        .mmc1_gpio_wp = -EINVAL,
-       .usb_pwr_level = GPIOF_OUT_INIT_LOW,
+       .usb_pwr_level = 0,
        .dvi_pd_gpio = -EINVAL,
        .usr_button_gpio = 4,
        .mmc_caps = MMC_CAP_4_BIT_DATA | MMC_CAP_8_BIT_DATA,
@@ -178,7 +178,7 @@ static void __init omap3_beagle_init_rev(void)
        case 0:
                printk(KERN_INFO "OMAP3 Beagle Rev: xM Ax/Bx\n");
                omap3_beagle_version = OMAP3BEAGLE_BOARD_XM;
-               beagle_config.usb_pwr_level = GPIOF_OUT_INIT_HIGH;
+               beagle_config.usb_pwr_level = 1;
                beagle_config.mmc_caps &= ~MMC_CAP_8_BIT_DATA;
                break;
        case 2:
index 1a884670a6c4fe1c40d2d668129502f0bd636fce..18ca61e300b3e86dd5bd92da2788945938723290 100644 (file)
 #define LIS302_IRQ1_GPIO 181
 #define LIS302_IRQ2_GPIO 180  /* Not yet in use */
 
-/* list all spi devices here */
+/* List all SPI devices here. Note that the list/probe order seems to matter! */
 enum {
        RX51_SPI_WL1251,
-       RX51_SPI_MIPID,         /* LCD panel */
        RX51_SPI_TSC2005,       /* Touch Controller */
+       RX51_SPI_MIPID,         /* LCD panel */
 };
 
 static struct wl12xx_platform_data wl1251_pdata;
index dab9fc014b971cb7834dd518558dbb58de0996c5..49fd0d501c9bc5edcbb83bbb234891e0c381753c 100644 (file)
@@ -28,6 +28,7 @@
 #include <linux/init.h>
 #include <linux/device.h>
 #include <linux/dma-mapping.h>
+#include <linux/of.h>
 #include <linux/omap-dma.h>
 
 #include "soc.h"
@@ -304,6 +305,9 @@ static int __init omap2_system_dma_init(void)
        if (res)
                return res;
 
+       if (of_have_populated_dt())
+               return res;
+
        pdev = platform_device_register_full(&omap_dma_dev_info);
        if (IS_ERR(pdev))
                return PTR_ERR(pdev);
index ed946df5ad8a6acd75bb9010843eac90b3d4480c..6c4da1254f5395d31ac32fdb5a836297703208f6 100644 (file)
@@ -1520,36 +1520,22 @@ static int gpmc_probe_dt(struct platform_device *pdev)
                return ret;
        }
 
-       for_each_node_by_name(child, "nand") {
-               ret = gpmc_probe_nand_child(pdev, child);
-               if (ret < 0) {
-                       of_node_put(child);
-                       return ret;
-               }
-       }
+       for_each_child_of_node(pdev->dev.of_node, child) {
 
-       for_each_node_by_name(child, "onenand") {
-               ret = gpmc_probe_onenand_child(pdev, child);
-               if (ret < 0) {
-                       of_node_put(child);
-                       return ret;
-               }
-       }
+               if (!child->name)
+                       continue;
 
-       for_each_node_by_name(child, "nor") {
-               ret = gpmc_probe_generic_child(pdev, child);
-               if (ret < 0) {
-                       of_node_put(child);
-                       return ret;
-               }
-       }
+               if (of_node_cmp(child->name, "nand") == 0)
+                       ret = gpmc_probe_nand_child(pdev, child);
+               else if (of_node_cmp(child->name, "onenand") == 0)
+                       ret = gpmc_probe_onenand_child(pdev, child);
+               else if (of_node_cmp(child->name, "ethernet") == 0 ||
+                        of_node_cmp(child->name, "nor") == 0)
+                       ret = gpmc_probe_generic_child(pdev, child);
 
-       for_each_node_by_name(child, "ethernet") {
-               ret = gpmc_probe_generic_child(pdev, child);
-               if (ret < 0) {
+               if (WARN(ret < 0, "%s: probing gpmc child %s failed\n",
+                        __func__, child->full_name))
                        of_node_put(child);
-                       return ret;
-               }
        }
 
        return 0;
index 0f4c18e6e60c686b0a82528aba55889d89586200..1272c41d474903c638e60902e66144e6a3692e79 100644 (file)
@@ -419,11 +419,15 @@ void __init omap3xxx_check_revision(void)
                        cpu_rev = "1.0";
                        break;
                case 1:
-               /* FALLTHROUGH */
-               default:
                        omap_revision = AM335X_REV_ES2_0;
                        cpu_rev = "2.0";
                        break;
+               case 2:
+               /* FALLTHROUGH */
+               default:
+                       omap_revision = AM335X_REV_ES2_1;
+                       cpu_rev = "2.1";
+                       break;
                }
                break;
        case 0xb8f2:
@@ -644,13 +648,12 @@ void __init omap_soc_device_init(void)
        soc_dev_attr->revision = soc_rev;
 
        soc_dev = soc_device_register(soc_dev_attr);
-       if (IS_ERR_OR_NULL(soc_dev)) {
+       if (IS_ERR(soc_dev)) {
                kfree(soc_dev_attr);
                return;
        }
 
        parent = soc_device_to_device(soc_dev);
-       if (!IS_ERR_OR_NULL(parent))
-               device_create_file(parent, &omap_soc_attr);
+       device_create_file(parent, &omap_soc_attr);
 }
 #endif /* CONFIG_SOC_BUS */
index 6543ebf8ecfc1ab7698a6d73b81d414b343249ca..3f26d297c0826e59691d49e2fe0149b6b9d6e93e 100644 (file)
 #define OMAP3_CONTROL_PADCONF_SAD2D_SWAKEUP_OFFSET             0xa1c
 #define OMAP3_CONTROL_PADCONF_JTAG_RTCK_OFFSET                 0xa1e
 #define OMAP3_CONTROL_PADCONF_JTAG_TDO_OFFSET                  0xa20
+#define OMAP3_CONTROL_PADCONF_GPIO_127                         0xa24
+#define OMAP3_CONTROL_PADCONF_GPIO_126                         0xa26
+#define OMAP3_CONTROL_PADCONF_GPIO_128                         0xa28
+#define OMAP3_CONTROL_PADCONF_GPIO_129                         0xa2a
 
 #define OMAP3_CONTROL_PADCONF_MUX_SIZE                         \
-               (OMAP3_CONTROL_PADCONF_JTAG_TDO_OFFSET + 0x2)
+               (OMAP3_CONTROL_PADCONF_GPIO_129 + 0x2)
index eeea4fa28fbcced9e8e8908d1749493ea4057a8b..e6d230700b2bdf3d0702de3b4e9cfeef67d6801e 100644 (file)
@@ -876,4 +876,4 @@ static int __init omap_device_late_init(void)
        bus_for_each_dev(&platform_bus_type, NULL, NULL, omap_device_late_idle);
        return 0;
 }
-omap_late_initcall(omap_device_late_init);
+omap_late_initcall_sync(omap_device_late_init);
index 18fdeeb3a44a494096ae7b246a02a9604a4fdb89..197cc16870d9e7f3a223817f9870307c0f99a968 100644 (file)
@@ -396,6 +396,7 @@ IS_OMAP_TYPE(3430, 0x3430)
 #define AM335X_CLASS           0x33500033
 #define AM335X_REV_ES1_0       AM335X_CLASS
 #define AM335X_REV_ES2_0       (AM335X_CLASS | (0x1 << 8))
+#define AM335X_REV_ES2_1       (AM335X_CLASS | (0x2 << 8))
 
 #define OMAP443X_CLASS         0x44300044
 #define OMAP4430_REV_ES1_0     (OMAP443X_CLASS | (0x10 << 8))
@@ -496,6 +497,7 @@ level(__##fn);
 #define omap_subsys_initcall(fn)       omap_initcall(subsys_initcall, fn)
 #define omap_device_initcall(fn)       omap_initcall(device_initcall, fn)
 #define omap_late_initcall(fn)         omap_initcall(late_initcall, fn)
+#define omap_late_initcall_sync(fn)    omap_initcall(late_initcall_sync, fn)
 
 #endif /* __ASSEMBLY__ */
 
index 05481490a5084c26e940bda8697106bfe50372c7..f8b23b8040d9aa61a1593d30a55c5ed951cc5f6a 100644 (file)
@@ -553,6 +553,8 @@ static inline void __init realtime_counter_init(void)
                               clksrc_nr, clksrc_src, clksrc_prop)      \
 void __init omap##name##_gptimer_timer_init(void)                      \
 {                                                                      \
+       if (omap_clk_init)                                              \
+               omap_clk_init();                                        \
        omap_dmtimer_init();                                            \
        omap2_gp_clockevent_init((clkev_nr), clkev_src, clkev_prop);    \
        omap2_gptimer_clocksource_init((clksrc_nr), clksrc_src,         \
@@ -563,6 +565,8 @@ void __init omap##name##_gptimer_timer_init(void)                   \
                                clksrc_nr, clksrc_src, clksrc_prop)     \
 void __init omap##name##_sync32k_timer_init(void)              \
 {                                                                      \
+       if (omap_clk_init)                                              \
+               omap_clk_init();                                        \
        omap_dmtimer_init();                                            \
        omap2_gp_clockevent_init((clkev_nr), clkev_src, clkev_prop);    \
        /* Enable the use of clocksource="gp_timer" kernel parameter */ \
index 80ca974b2f828de48e6e99ba4204662f08babacb..6988b117fc174a70e049df575417dea3093daa1b 100644 (file)
@@ -38,7 +38,7 @@ config ARCH_MARCO
        select CPU_V7
        select HAVE_ARM_SCU if SMP
        select HAVE_SMP
-       select SMP_ON_UP
+       select SMP_ON_UP if SMP
        help
           Support for CSR SiRFSoC ARM Cortex A9 Platform
 
index 9075461999c11d31d67af317e3de806666ba987f..96100dbf5a2e8353e9fad34ded41ec5e83dda7ad 100644 (file)
@@ -162,7 +162,6 @@ config MACH_XCEP
        select MTD
        select MTD_CFI
        select MTD_CFI_INTELEXT
-       select MTD_CHAR
        select MTD_PHYSMAP
        select PXA25x
        select SMC91X
index 3621599c38adf023826bc136b335135b19778380..7aa6e8cf830f5eb4796c25881b12221f19035e4d 100644 (file)
@@ -35,6 +35,8 @@ void __init spear13xx_l2x0_init(void)
         * write alloc and 'Full line of zero' options
         *
         */
+       if (!IS_ENABLED(CONFIG_CACHE_L2X0))
+               return;
 
        writel_relaxed(0x06, VA_L2CC_BASE + L2X0_PREFETCH_CTRL);
 
index 20c3b372cdf531cde6412a6fcdbefce5c5950bf0..84d72fc36dfea1434431cb0b98a388f4b178e667 100644 (file)
@@ -63,6 +63,7 @@ config ARCH_TEGRA_114_SOC
        select ARM_ARCH_TIMER
        select ARM_GIC
        select ARM_L1_CACHE_SHIFT_6
+       select CPU_FREQ_TABLE if CPU_FREQ
        select CPU_V7
        select PINCTRL
        select PINCTRL_TEGRA114
index f66d7deae46d9d63b22ffb084605f77b69ae3fa2..6a4387e39df809f1b21d899b89d2212924ae19cd 100644 (file)
@@ -19,6 +19,8 @@ if ARCH_U8500
 config UX500_SOC_COMMON
        bool
        default y
+       select ABX500_CORE
+       select AB8500_CORE
        select ARM_ERRATA_754322
        select ARM_ERRATA_764369 if SMP
        select ARM_GIC
index a15dd6b63a8f8f9848eb396e2baa4ec57a57ffa0..3cd555ac6d0a3e5c81478dfbf71f79f10e72b73b 100644 (file)
@@ -403,8 +403,8 @@ static int mop500_prox_activate(struct device *dev)
                        "no regulator\n");
                return PTR_ERR(prox_regulator);
        }
-       regulator_enable(prox_regulator);
-       return 0;
+
+       return regulator_enable(prox_regulator);
 }
 
 static void mop500_prox_deactivate(struct device *dev)
index 995928ba22fddd857b319892854254b778e17cd1..e90b5ab23b6daf7e691bbff5b74fd6c165e96518 100644 (file)
@@ -191,7 +191,7 @@ static const char *db8500_read_soc_id(void)
        /* Throw these device-specific numbers into the entropy pool */
        add_device_randomness(uid, 0x14);
        return kasprintf(GFP_KERNEL, "%08x%08x%08x%08x%08x",
-                        readl((u32 *)uid+1),
+                        readl((u32 *)uid+0),
                         readl((u32 *)uid+1), readl((u32 *)uid+2),
                         readl((u32 *)uid+3), readl((u32 *)uid+4));
 }
index b6083bb1eb8cf4c9092630a25c43b041c163f7f6..8802030df98d0fbac53f4cd1f01df30f598f1773 100644 (file)
@@ -450,7 +450,6 @@ static void __init v2m_dt_init(void)
 
 static const char * const v2m_dt_match[] __initconst = {
        "arm,vexpress",
-       "xen,xenvm",
        NULL,
 };
 
index adc0945255aea2dee90e170d2426c7a25fc128e0..061f283f579e891b59f8a0c17c0dcafa7bdc4288 100644 (file)
@@ -32,6 +32,7 @@ static void __init virt_init(void)
 
 static const char *virt_dt_match[] = {
        "linux,dummy-virt",
+       "xen,xenvm",
        NULL
 };
 
index 2eca54b65906e1ffbf69ffd9a27b74d71b7415d6..9433605cd290b38aabba4e6d0f94de822d84e743 100644 (file)
@@ -3,6 +3,6 @@
 #
 ccflags-$(CONFIG_ARCH_MULTIPLATFORM) := -I$(srctree)/$(src)/include
 
-orion-gpio-$(CONFIG_GENERIC_GPIO) += gpio.o
+orion-gpio-$(CONFIG_GPIOLIB)      += gpio.o
 obj-$(CONFIG_PLAT_ORION_LEGACY)   += irq.o pcie.o time.o common.o mpp.o
 obj-$(CONFIG_PLAT_ORION_LEGACY)   += $(orion-gpio-y)
index e39c2ba6e2fba385aae9b9ce43db23e48d2caa05..249fe6333e180b8240b1734c7066957fdf4c9e41 100644 (file)
@@ -150,7 +150,7 @@ err_out:
 }
 
 /*
- * GENERIC_GPIO primitives.
+ * GPIO primitives.
  */
 static int orion_gpio_request(struct gpio_chip *chip, unsigned pin)
 {
index 8dc0605a9ce9e2d3b5df910fafd6eadb008bfb21..d30042e39974f949e6ab3cc81f88f7c6dfc51c4d 100644 (file)
@@ -2,6 +2,7 @@
 #include <xen/events.h>
 #include <xen/grant_table.h>
 #include <xen/hvm.h>
+#include <xen/interface/vcpu.h>
 #include <xen/interface/xen.h>
 #include <xen/interface/memory.h>
 #include <xen/interface/hvm/params.h>
@@ -9,9 +10,11 @@
 #include <xen/platform_pci.h>
 #include <xen/xenbus.h>
 #include <xen/page.h>
+#include <xen/interface/sched.h>
 #include <xen/xen-ops.h>
 #include <asm/xen/hypervisor.h>
 #include <asm/xen/hypercall.h>
+#include <asm/system_misc.h>
 #include <linux/interrupt.h>
 #include <linux/irqreturn.h>
 #include <linux/module.h>
@@ -32,6 +35,7 @@ struct shared_info xen_dummy_shared_info;
 struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info;
 
 DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu);
+static struct vcpu_info __percpu *xen_vcpu_info;
 
 /* These are unused until we support booting "pre-ballooned" */
 unsigned long xen_released_pages;
@@ -148,6 +152,47 @@ int xen_unmap_domain_mfn_range(struct vm_area_struct *vma,
 }
 EXPORT_SYMBOL_GPL(xen_unmap_domain_mfn_range);
 
+static int __init xen_secondary_init(unsigned int cpu)
+{
+       struct vcpu_register_vcpu_info info;
+       struct vcpu_info *vcpup;
+       int err;
+
+       pr_info("Xen: initializing cpu%d\n", cpu);
+       vcpup = per_cpu_ptr(xen_vcpu_info, cpu);
+
+       info.mfn = __pa(vcpup) >> PAGE_SHIFT;
+       info.offset = offset_in_page(vcpup);
+
+       err = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_info, cpu, &info);
+       if (err) {
+               pr_debug("register_vcpu_info failed: err=%d\n", err);
+       } else {
+               /* This cpu is using the registered vcpu info, even if
+                  later ones fail to. */
+               per_cpu(xen_vcpu, cpu) = vcpup;
+       }
+       return 0;
+}
+
+static void xen_restart(char str, const char *cmd)
+{
+       struct sched_shutdown r = { .reason = SHUTDOWN_reboot };
+       int rc;
+       rc = HYPERVISOR_sched_op(SCHEDOP_shutdown, &r);
+       if (rc)
+               BUG();
+}
+
+static void xen_power_off(void)
+{
+       struct sched_shutdown r = { .reason = SHUTDOWN_poweroff };
+       int rc;
+       rc = HYPERVISOR_sched_op(SCHEDOP_shutdown, &r);
+       if (rc)
+               BUG();
+}
+
 /*
  * see Documentation/devicetree/bindings/arm/xen.txt for the
  * documentation of the Xen Device Tree format.
@@ -163,6 +208,7 @@ static int __init xen_guest_init(void)
        const char *version = NULL;
        const char *xen_prefix = "xen,xen-";
        struct resource res;
+       int i;
 
        node = of_find_compatible_node(NULL, NULL, "xen,xen");
        if (!node) {
@@ -209,18 +255,26 @@ static int __init xen_guest_init(void)
 
        /* xen_vcpu is a pointer to the vcpu_info struct in the shared_info
         * page, we use it in the event channel upcall and in some pvclock
-        * related functions. We don't need the vcpu_info placement
-        * optimizations because we don't use any pv_mmu or pv_irq op on
-        * HVM.
+        * related functions. 
         * The shared info contains exactly 1 CPU (the boot CPU). The guest
         * is required to use VCPUOP_register_vcpu_info to place vcpu info
-        * for secondary CPUs as they are brought up. */
-       per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0];
+        * for secondary CPUs as they are brought up.
+        * For uniformity we use VCPUOP_register_vcpu_info even on cpu0.
+        */
+       xen_vcpu_info = __alloc_percpu(sizeof(struct vcpu_info),
+                                              sizeof(struct vcpu_info));
+       if (xen_vcpu_info == NULL)
+               return -ENOMEM;
+       for_each_online_cpu(i)
+               xen_secondary_init(i);
 
        gnttab_init();
        if (!xen_initial_domain())
                xenbus_probe(NULL);
 
+       pm_power_off = xen_power_off;
+       arm_pm_restart = xen_restart;
+
        return 0;
 }
 core_initcall(xen_guest_init);
@@ -231,6 +285,11 @@ static irqreturn_t xen_arm_callback(int irq, void *arg)
        return IRQ_HANDLED;
 }
 
+static __init void xen_percpu_enable_events(void *unused)
+{
+       enable_percpu_irq(xen_events_irq, 0);
+}
+
 static int __init xen_init_events(void)
 {
        if (!xen_domain() || xen_events_irq < 0)
@@ -239,12 +298,12 @@ static int __init xen_init_events(void)
        xen_init_IRQ();
 
        if (request_percpu_irq(xen_events_irq, xen_arm_callback,
-                       "events", xen_vcpu)) {
+                       "events", &xen_vcpu)) {
                pr_err("Error requesting IRQ %d\n", xen_events_irq);
                return -EINVAL;
        }
 
-       enable_percpu_irq(xen_events_irq, 0);
+       on_each_cpu(xen_percpu_enable_events, NULL, 0);
 
        return 0;
 }
@@ -259,4 +318,5 @@ EXPORT_SYMBOL_GPL(HYPERVISOR_sched_op);
 EXPORT_SYMBOL_GPL(HYPERVISOR_hvm_op);
 EXPORT_SYMBOL_GPL(HYPERVISOR_memory_op);
 EXPORT_SYMBOL_GPL(HYPERVISOR_physdev_op);
+EXPORT_SYMBOL_GPL(HYPERVISOR_vcpu_op);
 EXPORT_SYMBOL_GPL(privcmd_call);
index 71f723984cbd94eced133e49bc7fd0db8dbdf92b..199cb2da76637929cd50bf1301f7771faefe8415 100644 (file)
@@ -87,6 +87,7 @@ HYPERCALL2(event_channel_op);
 HYPERCALL2(hvm_op);
 HYPERCALL2(memory_op);
 HYPERCALL2(physdev_op);
+HYPERCALL3(vcpu_op);
 
 ENTRY(privcmd_call)
        stmdb sp!, {r4}
index 73b6e764034c7f836b3b2b12de9e63a99a858de4..48347dcf056681641936a8cedcfeb88272fc8f7e 100644 (file)
@@ -6,6 +6,7 @@ config ARM64
        select ARCH_WANT_FRAME_POINTERS
        select ARM_AMBA
        select ARM_ARCH_TIMER
+       select ARM_GIC
        select CLONE_BACKWARDS
        select COMMON_CLK
        select GENERIC_CLOCKEVENTS
@@ -31,6 +32,8 @@ config ARM64
        select OF
        select OF_EARLY_FLATTREE
        select PERF_USE_VMALLOC
+       select POWER_RESET
+       select POWER_SUPPLY
        select RTC_LIB
        select SPARSE_IRQ
        select SYSCTL_EXCEPTION_TRACE
@@ -92,9 +95,6 @@ config SWIOTLB
 config IOMMU_HELPER
        def_bool SWIOTLB
 
-config GENERIC_GPIO
-       bool
-
 source "init/Kconfig"
 
 source "kernel/Kconfig.freezer"
@@ -105,6 +105,7 @@ config ARCH_VEXPRESS
        bool "ARMv8 software model (Versatile Express)"
        select ARCH_REQUIRE_GPIOLIB
        select COMMON_CLK_VERSATILE
+       select POWER_RESET_VEXPRESS
        select VEXPRESS_CONFIG
        help
          This enables support for the ARMv8 software model (Versatile
index 198682b6de313a136d6a08571eb392876bba59ba..84fcc5018284b6cee3dca436dd4ec7634e3b00cc 100644 (file)
@@ -23,7 +23,7 @@
        };
 
        cpus {
-               #address-cells = <1>;
+               #address-cells = <2>;
                #size-cells = <0>;
 
                cpu@0 {
index 95e407255347b49971e59b3273148ba582166ee6..a6e1750369efcb32ea589b6450589515bda1c283 100644 (file)
@@ -41,7 +41,7 @@ extern void show_pte(struct mm_struct *mm, unsigned long addr);
 extern void __show_regs(struct pt_regs *);
 
 void soft_restart(unsigned long);
-extern void (*pm_restart)(const char *cmd);
+extern void (*arm_pm_restart)(char str, const char *cmd);
 
 #define UDBG_UNDEFINED (1 << 0)
 #define UDBG_SYSCALL   (1 << 1)
index f4919721f7dde20ea0fabb3369395c2359b7cc34..46f02c3b5015ece9b1ca0ab4d57d9540a3288e67 100644 (file)
@@ -81,8 +81,8 @@ void soft_restart(unsigned long addr)
 void (*pm_power_off)(void);
 EXPORT_SYMBOL_GPL(pm_power_off);
 
-void (*pm_restart)(const char *cmd);
-EXPORT_SYMBOL_GPL(pm_restart);
+void (*arm_pm_restart)(char str, const char *cmd);
+EXPORT_SYMBOL_GPL(arm_pm_restart);
 
 void arch_cpu_idle_prepare(void)
 {
@@ -131,8 +131,8 @@ void machine_restart(char *cmd)
        local_fiq_disable();
 
        /* Now call the architecture specific reboot code. */
-       if (pm_restart)
-               pm_restart(cmd);
+       if (arm_pm_restart)
+               arm_pm_restart('h', cmd);
 
        /*
         * Whoops - the architecture was unable to reboot.
index 36216d30cb9af72167f44b76cce35ab81de4ad8b..e5db797790d3265c5418d749b7537c494ddeb250 100644 (file)
 
 /*
  * x0: bits 5:0  bit offset
- *     bits 63:6 word offset
+ *     bits 31:6 word offset
  * x1: address
  */
        .macro  bitop, name, instr
 ENTRY( \name   )
-       and     x3, x0, #63             // Get bit offset
-       eor     x0, x0, x3              // Clear low bits
+       and     w3, w0, #63             // Get bit offset
+       eor     w0, w0, w3              // Clear low bits
        mov     x2, #1
        add     x1, x1, x0, lsr #3      // Get word offset
        lsl     x3, x2, x3              // Create mask
@@ -41,8 +41,8 @@ ENDPROC(\name )
 
        .macro  testop, name, instr
 ENTRY( \name   )
-       and     x3, x0, #63             // Get bit offset
-       eor     x0, x0, x3              // Clear low bits
+       and     w3, w0, #63             // Get bit offset
+       eor     w0, w0, w3              // Clear low bits
        mov     x2, #1
        add     x1, x1, x0, lsr #3      // Get word offset
        lsl     x4, x2, x3              // Create mask
index 52638171d6fdc3f1e287442e1564fe48a41b526e..98af6e760cce6781a5b117d786bcd16fc136a0b1 100644 (file)
@@ -148,6 +148,7 @@ void do_bad_area(unsigned long addr, unsigned int esr, struct pt_regs *regs)
 #define VM_FAULT_BADACCESS     0x020000
 
 #define ESR_WRITE              (1 << 6)
+#define ESR_CM                 (1 << 8)
 #define ESR_LNX_EXEC           (1 << 24)
 
 /*
@@ -206,7 +207,7 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
        struct task_struct *tsk;
        struct mm_struct *mm;
        int fault, sig, code;
-       int write = esr & ESR_WRITE;
+       bool write = (esr & ESR_WRITE) && !(esr & ESR_CM);
        unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
                (write ? FAULT_FLAG_WRITE : 0);
 
index 22c40308360bb0a76071db11c5493ecd09aff6af..bdc35589277f721805eed5675fe0bfabd8dc0aec 100644 (file)
@@ -26,9 +26,6 @@ config AVR32
          There is an AVR32 Linux project with a web page at
          http://avr32linux.org/.
 
-config GENERIC_GPIO
-       def_bool y
-
 config STACKTRACE_SUPPORT
        def_bool y
 
index 453ebe46b065d01d0afa272528220f9e73aede52..a117652b5feaac84e4bb880bb8b1b4772c8b6f5a 100644 (file)
@@ -27,7 +27,7 @@ config BLACKFIN
        select HAVE_OPROFILE
        select HAVE_PERF_EVENTS
        select ARCH_HAVE_CUSTOM_GPIO_H
-       select ARCH_WANT_OPTIONAL_GPIOLIB
+       select ARCH_REQUIRE_GPIOLIB
        select HAVE_UID16
        select HAVE_UNDERSCORE_SYMBOL_PREFIX
        select VIRT_TO_BUS
@@ -52,9 +52,6 @@ config GENERIC_BUG
 config ZONE_DMA
        def_bool y
 
-config GENERIC_GPIO
-       def_bool y
-
 config FORCE_MAX_ZONEORDER
        int
        default "14"
index 06dd026533e3b758b68091bc0a8c754b3084f0da..8769a9045a543995c137f02f9bfc92bf3ce0da36 100644 (file)
@@ -264,7 +264,6 @@ config ETRAX_AXISFLASHMAP
        select MTD_CFI
        select MTD_CFI_AMDSTD
        select MTD_JEDECPROBE if ETRAX_ARCH_V32
-       select MTD_CHAR
        select MTD_BLOCK
        select MTD_COMPLEX_MAPPINGS
        help
index af4a486dadcd8437ba7160b1852664a04888cedb..c55971a40c34e8433faa9fc70d12566feed21045 100644 (file)
@@ -404,7 +404,6 @@ config ETRAX_AXISFLASHMAP
        select MTD_CFI
        select MTD_CFI_AMDSTD
        select MTD_JEDECPROBE
-       select MTD_CHAR
        select MTD_BLOCK
        select MTD_COMPLEX_MAPPINGS
        help
index b82e08615d1bb81960a9010aeba114f522b80657..cd9f15b92f8f152cacb69c68abf59aea81833afe 100644 (file)
@@ -76,7 +76,7 @@ static int __init init_cris_profile(void)
        entry = proc_create("system_profile", S_IWUSR | S_IRUGO, NULL,
                            &cris_proc_profile_operations);
        if (entry) {
-               entry->size = SAMPLE_BUFFER_SIZE;
+               proc_set_size(entry, SAMPLE_BUFFER_SIZE);
        }
        prof_running = 1;
 
index 04dff5bdcbf72b30aebc5c2c5287d247eb3e7d2d..33a97929d055ca9d1131ffce9c517b3a4597bb38 100644 (file)
@@ -30,8 +30,6 @@ config HEXAGON
        select GENERIC_CLOCKEVENTS_BROADCAST
        select MODULES_USE_ELF_RELA
        select GENERIC_CPU_DEVICES
-       select GENERIC_KERNEL_THREAD
-       select GENERIC_KERNEL_EXECVE
        ---help---
          Qualcomm Hexagon is a processor architecture designed for high
          performance and low power across a wide variety of applications.
@@ -157,9 +155,6 @@ source "mm/Kconfig"
 
 source "kernel/Kconfig.hz"
 
-config GENERIC_GPIO
-       def_bool n
-
 endmenu
 
 source "init/Kconfig"
index e3086185fc9ff69a401993597c9a46aba408848d..67c6ccc14770320c21f334b16c13e1c480a99cd7 100644 (file)
@@ -291,12 +291,12 @@ event_dispatch:
        /*  "Nested control path" -- if the previous mode was kernel  */
        {
                R0 = memw(R29 + #_PT_ER_VMEST);
-               R16.L = #LO(do_work_pending);
+               R26.L = #LO(do_work_pending);
        }
        {
                P0 = tstbit(R0, #HVM_VMEST_UM_SFT);
                if (!P0.new) jump:nt restore_all;
-               R16.H = #HI(do_work_pending);
+               R26.H = #HI(do_work_pending);
                R0 = #VM_INT_DISABLE;
        }
 
@@ -304,7 +304,7 @@ event_dispatch:
         * Check also the return from fork/system call, normally coming back from
         * user mode
         *
-        * R16 needs to have do_work_pending, and R0 should have VM_INT_DISABLE
+        * R26 needs to have do_work_pending, and R0 should have VM_INT_DISABLE
         */
 
 check_work_pending:
@@ -313,7 +313,7 @@ check_work_pending:
        {
                R0 = R29;  /*  regs should still be at top of stack  */
                R1 = memw(THREADINFO_REG + #_THREAD_INFO_FLAGS);
-               callr R16;
+               callr R26;
        }
 
        {
@@ -375,11 +375,11 @@ _K_enter_debug:
 ret_from_fork:
        {
                call schedule_tail
-               R16.H = #HI(do_work_pending);
+               R26.H = #HI(do_work_pending);
        }
        {
                P0 = cmp.eq(R24, #0);
-               R16.L = #LO(do_work_pending);
+               R26.L = #LO(do_work_pending);
                R0 = #VM_INT_DISABLE;
        }
        if P0 jump check_work_pending
index d393f841ff5acb660f120de6c545803e9b0b6dd3..1a2b7749b0478a75c49e58df2862de0765d71980 100644 (file)
@@ -101,9 +101,6 @@ config GENERIC_CALIBRATE_DELAY
 config HAVE_SETUP_PER_CPU_AREA
        def_bool y
 
-config GENERIC_GPIO
-       bool
-
 config DMI
        bool
        default y
index 6de813370b8c7d01e8b98d78634baef15da6fe3b..821170e5f6ed4e29e2135af409258d60d7fb4b4f 100644 (file)
@@ -35,9 +35,6 @@ config ARCH_HAS_ILOG2_U32
 config ARCH_HAS_ILOG2_U64
        bool
 
-config GENERIC_GPIO
-       bool
-
 config GENERIC_HWEIGHT
        bool
        default y
index b1cfff832fb5c56a6543d075ddbccee25fb7305e..d266787725b468ab94286aeb3e7cca62698c5d6c 100644 (file)
@@ -22,8 +22,7 @@ config M68KCLASSIC
 
 config COLDFIRE
        bool "Coldfire CPU family support"
-       select GENERIC_GPIO
-       select ARCH_WANT_OPTIONAL_GPIOLIB
+       select ARCH_REQUIRE_GPIOLIB
        select ARCH_HAVE_CUSTOM_GPIO_H
        select CPU_HAS_NO_BITFIELDS
        select CPU_HAS_NO_MULDIV64
index 6f16c14693272e593aa0f98d33e6b62d60f8ce79..dcd94406030e3e9cebd2a15a01260c389d2d24a8 100644 (file)
@@ -52,9 +52,6 @@ config GENERIC_HWEIGHT
 config GENERIC_CALIBRATE_DELAY
        def_bool y
 
-config GENERIC_GPIO
-       def_bool n
-
 config NO_IOPORT
        def_bool y
 
index 54237af0b07c8736a46554bcf176a0d22ae2ef44..d22a4ecffff422f542e96f81527b45a4e3f9c6c2 100644 (file)
@@ -54,9 +54,6 @@ config GENERIC_HWEIGHT
 config GENERIC_CALIBRATE_DELAY
        def_bool y
 
-config GENERIC_GPIO
-       bool
-
 config GENERIC_CSUM
        def_bool y
 
index e5f3794744f14c12707d8631de62341e56e36ecb..a90cfc702bb1a31cade6c98d0bd4eb0ead652f8d 100644 (file)
@@ -61,8 +61,7 @@ config MIPS_ALCHEMY
        select SYS_HAS_CPU_MIPS32_R1
        select SYS_SUPPORTS_32BIT_KERNEL
        select SYS_SUPPORTS_APM_EMULATION
-       select GENERIC_GPIO
-       select ARCH_WANT_OPTIONAL_GPIOLIB
+       select ARCH_REQUIRE_GPIOLIB
        select SYS_SUPPORTS_ZBOOT
        select USB_ARCH_HAS_OHCI
        select USB_ARCH_HAS_EHCI
@@ -225,7 +224,6 @@ config MACH_JZ4740
        select SYS_SUPPORTS_ZBOOT_UART16550
        select DMA_NONCOHERENT
        select IRQ_CPU
-       select GENERIC_GPIO
        select ARCH_REQUIRE_GPIOLIB
        select SYS_HAS_EARLY_PRINTK
        select HAVE_PWM
@@ -937,7 +935,6 @@ config CSRC_SB1250
        bool
 
 config GPIO_TXX9
-       select GENERIC_GPIO
        select ARCH_REQUIRE_GPIOLIB
        bool
 
@@ -1009,9 +1006,6 @@ config GENERIC_ISA_DMA_SUPPORT_BROKEN
 config ISA_DMA_API
        bool
 
-config GENERIC_GPIO
-       bool
-
 config HOLES_IN_ZONE
        bool
 
@@ -1112,7 +1106,6 @@ config SOC_PNX833X
        select SYS_SUPPORTS_32BIT_KERNEL
        select SYS_SUPPORTS_LITTLE_ENDIAN
        select SYS_SUPPORTS_BIG_ENDIAN
-       select GENERIC_GPIO
        select CPU_MIPSR2_IRQ_VI
 
 config SOC_PNX8335
@@ -1203,7 +1196,6 @@ config CPU_LOONGSON2F
        bool "Loongson 2F"
        depends on SYS_HAS_CPU_LOONGSON2F
        select CPU_LOONGSON2
-       select GENERIC_GPIO
        select ARCH_REQUIRE_GPIOLIB
        help
          The Loongson 2F processor implements the MIPS III instruction set
index e526488df6559895f282e6d7b958b8bd81fca7c2..4c57b3e5743f749f8be549605cabdbf56b2d677b 100644 (file)
@@ -4,7 +4,7 @@
 
 obj-y += setup.o init.o cmdline.o env.o time.o reset.o irq.o \
     pci.o bonito-irq.o mem.o machtype.o platform.o
-obj-$(CONFIG_GENERIC_GPIO) += gpio.o
+obj-$(CONFIG_GPIOLIB) += gpio.o
 
 #
 # Serial port support
index 5524f2c7b05c5e08298a7c3f25f7c1abb9e4113c..5364aabc21027951532bc5d0e7d97898344f701a 100644 (file)
@@ -118,7 +118,7 @@ EXPORT_SYMBOL(clk_put);
 
 /* GPIO support */
 
-#ifdef CONFIG_GENERIC_GPIO
+#ifdef CONFIG_GPIOLIB
 int gpio_to_irq(unsigned gpio)
 {
        return -EINVAL;
index 81b9ddbc9166473ef46a85c7640a135807c920c8..1072bfd18c5083034aabb20116c21f144f88b55e 100644 (file)
@@ -44,9 +44,6 @@ config GENERIC_HWEIGHT
 config NO_IOPORT
        def_bool y
 
-config GENERIC_GPIO
-       def_bool y
-
 config TRACE_IRQFLAGS_SUPPORT
         def_bool y
 
index 433e75a2ee9afa2ff315bf39a97a429ebc64f6a4..cad060f288cf51e089bfb9cf7b6e879064bc0989 100644 (file)
@@ -13,6 +13,7 @@ config PARISC
        select BUG
        select HAVE_PERF_EVENTS
        select GENERIC_ATOMIC64 if !64BIT
+       select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
        select HAVE_GENERIC_HARDIRQS
        select BROKEN_RODATA
        select GENERIC_IRQ_PROBE
@@ -242,6 +243,14 @@ config SMP
 
          If you don't know what to do here, say N.
 
+config IRQSTACKS
+       bool "Use separate kernel stacks when processing interrupts"
+       default n
+       help
+         If you say Y here the kernel will use separate kernel stacks
+         for handling hard and soft interrupts.  This can help avoid
+         overflowing the process kernel stacks.
+
 config HOTPLUG_CPU
        bool
        default y if SMP
index bc989e522a045c17ca4514478b7cb5ef9d77fc4d..08a332f6ee874d814df7a97615dd44821add3c13 100644 (file)
@@ -13,3 +13,14 @@ config DEBUG_RODATA
          If in doubt, say "N".
 
 endmenu
+
+config DEBUG_STACKOVERFLOW
+       bool "Check for stack overflows"
+       default y
+       depends on DEBUG_KERNEL
+       ---help---
+         Say Y here if you want to check for overflows of the kernel, IRQ
+         and exception stacks. This option will print detailed messages
+         about stack usage when the free stack space drops below a certain
+         limit.
+         If in doubt, say "N".
index 113e28206503ae55e8915d7382d17dc059a20eee..2f967cc6649e0cab325136624a614af4f0784281 100644 (file)
@@ -24,9 +24,7 @@ CHECKFLAGS    += -D__hppa__=1
 LIBGCC         = $(shell $(CC) $(KBUILD_CFLAGS) -print-libgcc-file-name)
 
 MACHINE                := $(shell uname -m)
-ifeq ($(MACHINE),parisc*)
-NATIVE         := 1
-endif
+NATIVE         := $(if $(filter parisc%,$(MACHINE)),1,0)
 
 ifdef CONFIG_64BIT
 UTS_MACHINE    := parisc64
index f38e1984b242f8df63e470bc878d62e8d3f4c447..472886ceab1dde20c5ac3928c4f71dd2642d547a 100644 (file)
@@ -229,6 +229,29 @@ static __inline__ int atomic64_add_unless(atomic64_t *v, long a, long u)
 
 #define atomic64_inc_not_zero(v) atomic64_add_unless((v), 1, 0)
 
+/*
+ * atomic64_dec_if_positive - decrement by 1 if old value positive
+ * @v: pointer of type atomic64_t
+ *
+ * The function returns the old value of *v minus 1, even if
+ * the atomic variable, v, was not decremented.
+ */
+static inline long atomic64_dec_if_positive(atomic64_t *v)
+{
+       long c, old, dec;
+       c = atomic64_read(v);
+       for (;;) {
+               dec = c - 1;
+               if (unlikely(dec < 0))
+                       break;
+               old = atomic64_cmpxchg((v), c, dec);
+               if (likely(old == c))
+                       break;
+               c = old;
+       }
+       return dec;
+}
+
 #endif /* !CONFIG_64BIT */
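
The helper above is the usual cmpxchg retry loop: reread the counter until
the decrement either succeeds or would take the value below zero. A minimal,
hypothetical usage sketch (the slot counter below is illustrative and not
part of this patch):

        #include <linux/atomic.h>
        #include <linux/types.h>

        static atomic64_t free_slots = ATOMIC64_INIT(8);

        /* Claim one slot; fail rather than let the count go negative. */
        static bool claim_slot(void)
        {
                /* atomic64_dec_if_positive() returns the old value minus one;
                 * a negative result means the counter was already 0 and was
                 * left untouched. */
                return atomic64_dec_if_positive(&free_slots) >= 0;
        }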
 
 
index 106b395688e1d167bf764a76de70d5d58e4109d7..d0eae5f2bd8795e244d2a086f30e7d78439050f3 100644 (file)
@@ -46,6 +46,9 @@ extern struct hppa_dma_ops pcx_dma_ops;
 
 extern struct hppa_dma_ops *hppa_dma_ops;
 
+#define dma_alloc_attrs(d, s, h, f, a) dma_alloc_coherent(d, s, h, f)
+#define dma_free_attrs(d, s, h, f, a) dma_free_coherent(d, s, h, f)
+
 static inline void *
 dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
                   gfp_t flag)
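
The two macros above simply drop the attrs argument and fall through to the
coherent helpers, so callers written against the *_attrs API keep working on
parisc. A hedged illustration (the wrapper name is made up; only
dma_alloc_attrs() is taken from the hunk):

        #include <linux/dma-mapping.h>

        /* On parisc this expands to dma_alloc_coherent(dev, size, handle,
         * GFP_KERNEL); the trailing attrs argument (NULL) is ignored. */
        static void *sketch_dma_alloc(struct device *dev, size_t size,
                                      dma_addr_t *handle)
        {
                return dma_alloc_attrs(dev, size, handle, GFP_KERNEL, NULL);
        }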
index 0d68184a76cb54006aeb5142c54315564eabd7aa..12373c4dababec920c0ec42f120064011567d4b6 100644 (file)
@@ -1,11 +1,41 @@
 /* hardirq.h: PA-RISC hard IRQ support.
  *
  * Copyright (C) 2001 Matthew Wilcox <matthew@wil.cx>
+ * Copyright (C) 2013 Helge Deller <deller@gmx.de>
  */
 
 #ifndef _PARISC_HARDIRQ_H
 #define _PARISC_HARDIRQ_H
 
-#include <asm-generic/hardirq.h>
+#include <linux/cache.h>
+#include <linux/threads.h>
+#include <linux/irq.h>
+
+typedef struct {
+       unsigned int __softirq_pending;
+#ifdef CONFIG_DEBUG_STACKOVERFLOW
+       unsigned int kernel_stack_usage;
+#endif
+#ifdef CONFIG_SMP
+       unsigned int irq_resched_count;
+       unsigned int irq_call_count;
+#endif
+       unsigned int irq_tlb_count;
+} ____cacheline_aligned irq_cpustat_t;
+
+DECLARE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);
+
+#define __ARCH_IRQ_STAT
+#define __IRQ_STAT(cpu, member) (irq_stat[cpu].member)
+#define inc_irq_stat(member)   this_cpu_inc(irq_stat.member)
+#define local_softirq_pending()        this_cpu_read(irq_stat.__softirq_pending)
+
+#define __ARCH_SET_SOFTIRQ_PENDING
+
+#define set_softirq_pending(x) \
+               this_cpu_write(irq_stat.__softirq_pending, (x))
+#define or_softirq_pending(x)  this_cpu_or(irq_stat.__softirq_pending, (x))
+
+#define ack_bad_irq(irq) WARN(1, "unexpected IRQ trap at vector %02x\n", irq)
 
 #endif /* _PARISC_HARDIRQ_H */
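
With irq_cpustat_t now a per-CPU structure, the hot path is a single
this_cpu_inc() through inc_irq_stat() and needs no locking; reporting code
reads other CPUs' counters with per_cpu(). A hedged reader-side sketch (the
helper name is made up; the accessor mirrors what irq.c does further down in
this series):

        #include <linux/percpu.h>
        #include <linux/printk.h>
        #include <asm/hardirq.h>

        /* Illustrative only: report one CPU's TLB-shootdown count, the same
         * counter that inc_irq_stat(irq_tlb_count) bumps on the flush path. */
        static void report_tlb_shootdowns(int cpu)
        {
                pr_info("CPU%d: %u TLB shootdowns\n",
                        cpu, per_cpu(irq_stat, cpu).irq_tlb_count);
        }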
index 09b54a57a48d025283af4816d319e14a911e6158..064015547d1e32be0a968ba946bc54f8db362895 100644 (file)
@@ -20,8 +20,6 @@
 
 #endif /* __ASSEMBLY__ */
 
-#define KERNEL_STACK_SIZE      (4*PAGE_SIZE)
-
 /*
  * Default implementation of macro that returns current
  * instruction pointer ("program counter").
 
 #ifndef __ASSEMBLY__
 
+/*
+ * IRQ STACK - used for irq handler
+ */
+#ifdef __KERNEL__
+
+#define IRQ_STACK_SIZE      (4096 << 2) /* 16k irq stack size */
+
+union irq_stack_union {
+       unsigned long stack[IRQ_STACK_SIZE/sizeof(unsigned long)];
+};
+
+DECLARE_PER_CPU(union irq_stack_union, irq_stack_union);
+
+void call_on_stack(unsigned long p1, void *func, unsigned long new_stack);
+
+#endif /* __KERNEL__ */
+
 /*
  * Data detected about CPUs at boot time which is the same for all CPU's.
  * HP boxes are SMP - ie identical processors.
@@ -97,7 +112,6 @@ struct cpuinfo_parisc {
        unsigned long txn_addr;     /* MMIO addr of EIR or id_eid */
 #ifdef CONFIG_SMP
        unsigned long pending_ipi;  /* bitmap of type ipi_message_type */
-       unsigned long ipi_count;    /* number ipi Interrupts */
 #endif
        unsigned long bh_count;     /* number of times bh was invoked */
        unsigned long prof_counter; /* per CPU profiling support */
index 6182832e5b6c9c166597f2627ecc9463a38a7d1b..540c88fa8f863d44adcc254fe9a3917cf015aa4d 100644 (file)
@@ -40,7 +40,7 @@ struct thread_info {
 
 /* thread information allocation */
 
-#define THREAD_SIZE_ORDER            2
+#define THREAD_SIZE_ORDER      2 /* PA-RISC requires at least 16k stack */
 /* Be sure to hunt all references to this down when you change the size of
  * the kernel stack */
 #define THREAD_SIZE             (PAGE_SIZE << THREAD_SIZE_ORDER)
index 8f1a8100bf2df394f0ca78e90dbec9f33d33125d..5273da991e062c841ac40efdb0f3174a4df8cde3 100644 (file)
@@ -22,6 +22,8 @@ extern spinlock_t pa_tlb_lock;
 extern void flush_tlb_all(void);
 extern void flush_tlb_all_local(void *);
 
+#define smp_flush_tlb_all()    flush_tlb_all()
+
 /*
  * flush_tlb_mm()
  *
index 83ded26cad065af6cda1d1dc725dec381e48a669..65fb4cbc3a0ffd88f7f100870e8b0c952de573e7 100644 (file)
@@ -606,7 +606,7 @@ void clear_user_highpage(struct page *page, unsigned long vaddr)
        /* Clear using TMPALIAS region.  The page doesn't need to
           be flushed but the kernel mapping needs to be purged.  */
 
-       vto = kmap_atomic(page, KM_USER0);
+       vto = kmap_atomic(page);
 
        /* The PA-RISC 2.0 Architecture book states on page F-6:
           "Before a write-capable translation is enabled, *all*
@@ -641,8 +641,8 @@ void copy_user_highpage(struct page *to, struct page *from,
           the `to' page must be flushed in copy_user_page_asm since
           it can be used to bring in executable code.  */
 
-       vfrom = kmap_atomic(from, KM_USER0);
-       vto = kmap_atomic(to, KM_USER1);
+       vfrom = kmap_atomic(from);
+       vto = kmap_atomic(to);
 
        purge_kernel_dcache_page_asm((unsigned long)vto);
        purge_tlb_start(flags);
index f33201bf8977208358b87c92911cb149a68f618a..4bb96ad9b0b14ddd1cb066bc21b304d861059f38 100644 (file)
 #if PT_NLEVELS == 3
        extru           \va,31-ASM_PMD_SHIFT,ASM_BITS_PER_PMD,\index
 #else
+# if defined(CONFIG_64BIT)
+       extrd,u         \va,63-ASM_PGDIR_SHIFT,ASM_BITS_PER_PGD,\index
+  #else
+  # if PAGE_SIZE > 4096
+       extru           \va,31-ASM_PGDIR_SHIFT,32-ASM_PGDIR_SHIFT,\index
+  # else
        extru           \va,31-ASM_PGDIR_SHIFT,ASM_BITS_PER_PGD,\index
+  # endif
+# endif
 #endif
        dep             %r0,31,PAGE_SHIFT,\pmd  /* clear offset */
        copy            %r0,\pte
 
        .text
 
-       .align  PAGE_SIZE
+       .align 4096
 
 ENTRY(fault_vector_20)
        /* First vector is invalid (0) */
@@ -825,11 +833,6 @@ ENTRY(syscall_exit_rfi)
        STREG   %r19,PT_SR7(%r16)
 
 intr_return:
-       /* NOTE: Need to enable interrupts incase we schedule. */
-       ssm     PSW_SM_I, %r0
-
-intr_check_resched:
-
        /* check for reschedule */
        mfctl   %cr30,%r1
        LDREG   TI_FLAGS(%r1),%r19      /* sched.h: TIF_NEED_RESCHED */
@@ -856,6 +859,11 @@ intr_check_sig:
        LDREG   PT_IASQ1(%r16), %r20
        cmpib,COND(=),n 0,%r20,intr_restore /* backward */
 
+       /* NOTE: We need to enable interrupts if we have to deliver
+        * signals. We used to do this earlier but it caused kernel
+        * stack overflows. */
+       ssm     PSW_SM_I, %r0
+
        copy    %r0, %r25                       /* long in_syscall = 0 */
 #ifdef CONFIG_64BIT
        ldo     -16(%r30),%r29                  /* Reference param save area */
@@ -907,6 +915,10 @@ intr_do_resched:
        cmpib,COND(=)   0, %r20, intr_do_preempt
        nop
 
+       /* NOTE: We need to enable interrupts if we schedule.  We used
+        * to do this earlier but it caused kernel stack overflows. */
+       ssm     PSW_SM_I, %r0
+
 #ifdef CONFIG_64BIT
        ldo     -16(%r30),%r29          /* Reference param save area */
 #endif
@@ -1694,7 +1706,8 @@ ENTRY(sys_\name\()_wrapper)
        ldo     TASK_REGS(%r1),%r1
        reg_save %r1
        mfctl   %cr27, %r28
-       b       sys_\name
+       ldil    L%sys_\name, %r31
+       be      R%sys_\name(%sr4,%r31)
        STREG   %r28, PT_CR27(%r1)
 ENDPROC(sys_\name\()_wrapper)
        .endm
@@ -1997,6 +2010,47 @@ ftrace_stub:
 ENDPROC(return_to_handler)
 #endif /* CONFIG_FUNCTION_TRACER */
 
+#ifdef CONFIG_IRQSTACKS
+/* void call_on_stack(unsigned long param1, void *func,
+                     unsigned long new_stack) */
+ENTRY(call_on_stack)
+       copy    %sp, %r1
+
+       /* Regarding the HPPA calling conventions for function pointers,
+          we assume the PIC register is not changed across call.  For
+          CONFIG_64BIT, the argument pointer is left to point at the
+          argument region allocated for the call to call_on_stack. */
+# ifdef CONFIG_64BIT
+       /* Switch to new stack.  We allocate two 128 byte frames.  */
+       ldo     256(%arg2), %sp
+       /* Save previous stack pointer and return pointer in frame marker */
+       STREG   %rp, -144(%sp)
+       /* Calls always use function descriptor */
+       LDREG   16(%arg1), %arg1
+       bve,l   (%arg1), %rp
+       STREG   %r1, -136(%sp)
+       LDREG   -144(%sp), %rp
+       bve     (%rp)
+       LDREG   -136(%sp), %sp
+# else
+       /* Switch to new stack.  We allocate two 64 byte frames.  */
+       ldo     128(%arg2), %sp
+       /* Save previous stack pointer and return pointer in frame marker */
+       STREG   %r1, -68(%sp)
+       STREG   %rp, -84(%sp)
+       /* Calls use function descriptor if PLABEL bit is set */
+       bb,>=,n %arg1, 30, 1f
+       depwi   0,31,2, %arg1
+       LDREG   0(%arg1), %arg1
+1:
+       be,l    0(%sr4,%arg1), %sr0, %r31
+       copy    %r31, %rp
+       LDREG   -84(%sp), %rp
+       bv      (%rp)
+       LDREG   -68(%sp), %sp
+# endif /* CONFIG_64BIT */
+ENDPROC(call_on_stack)
+#endif /* CONFIG_IRQSTACKS */
 
 get_register:
        /*
index 5595a2f311816d3d5e24420e5598563c4815d3a8..e158b6fbf1b472e2abd62277206518192f57b7ac 100644 (file)
         * IODC requires 7K byte stack.  That leaves 1K byte for os_hpmc.
         */
 
-       .align  PAGE_SIZE
+       .align 4096
 hpmc_stack:
        .block 16384
 
 #define HPMC_IODC_BUF_SIZE 0x8000
 
-       .align  PAGE_SIZE
+       .align 4096
 hpmc_iodc_buf:
        .block HPMC_IODC_BUF_SIZE
 
index 8094d3ed3b646328be21fd26738298a365089555..e255db0bb7619cf92e8581cae017f01f20c4718d 100644 (file)
@@ -152,6 +152,39 @@ static struct irq_chip cpu_interrupt_type = {
        .irq_retrigger  = NULL,
 };
 
+DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);
+#define irq_stats(x)           (&per_cpu(irq_stat, x))
+
+/*
+ * /proc/interrupts printing for arch specific interrupts
+ */
+int arch_show_interrupts(struct seq_file *p, int prec)
+{
+       int j;
+
+#ifdef CONFIG_DEBUG_STACKOVERFLOW
+       seq_printf(p, "%*s: ", prec, "STK");
+       for_each_online_cpu(j)
+               seq_printf(p, "%10u ", irq_stats(j)->kernel_stack_usage);
+       seq_printf(p, "  Kernel stack usage\n");
+#endif
+#ifdef CONFIG_SMP
+       seq_printf(p, "%*s: ", prec, "RES");
+       for_each_online_cpu(j)
+               seq_printf(p, "%10u ", irq_stats(j)->irq_resched_count);
+       seq_printf(p, "  Rescheduling interrupts\n");
+       seq_printf(p, "%*s: ", prec, "CAL");
+       for_each_online_cpu(j)
+               seq_printf(p, "%10u ", irq_stats(j)->irq_call_count);
+       seq_printf(p, "  Function call interrupts\n");
+#endif
+       seq_printf(p, "%*s: ", prec, "TLB");
+       for_each_online_cpu(j)
+               seq_printf(p, "%10u ", irq_stats(j)->irq_tlb_count);
+       seq_printf(p, "  TLB shootdowns\n");
+       return 0;
+}
+
 int show_interrupts(struct seq_file *p, void *v)
 {
        int i = *(loff_t *) v, j;
@@ -219,6 +252,9 @@ int show_interrupts(struct seq_file *p, void *v)
                raw_spin_unlock_irqrestore(&desc->lock, flags);
        }
 
+       if (i == NR_IRQS)
+               arch_show_interrupts(p, 3);
+
        return 0;
 }
 
@@ -330,6 +366,66 @@ static inline int eirr_to_irq(unsigned long eirr)
        return (BITS_PER_LONG - bit) + TIMER_IRQ;
 }
 
+int sysctl_panic_on_stackoverflow = 1;
+
+static inline void stack_overflow_check(struct pt_regs *regs)
+{
+#ifdef CONFIG_DEBUG_STACKOVERFLOW
+       #define STACK_MARGIN    (256*6)
+
+       /* Our stack starts directly behind the thread_info struct. */
+       unsigned long stack_start = (unsigned long) current_thread_info();
+       unsigned long sp = regs->gr[30];
+       unsigned long stack_usage;
+       unsigned int *last_usage;
+
+       /* If sr7 != 0, we interrupted a userspace process and do not want to
+        * check its stack for overflow; only the kernel stack is checked. */
+       if (regs->sr[7])
+               return;
+
+       /* calculate kernel stack usage */
+       stack_usage = sp - stack_start;
+       last_usage = &per_cpu(irq_stat.kernel_stack_usage, smp_processor_id());
+
+       if (unlikely(stack_usage > *last_usage))
+               *last_usage = stack_usage;
+
+       if (likely(stack_usage < (THREAD_SIZE - STACK_MARGIN)))
+               return;
+
+       pr_emerg("stackcheck: %s will most likely overflow kernel stack "
+                "(sp:%lx, stk bottom-top:%lx-%lx)\n",
+               current->comm, sp, stack_start, stack_start + THREAD_SIZE);
+
+       if (sysctl_panic_on_stackoverflow)
+               panic("low stack detected by irq handler - check messages\n");
+#endif
+}
+
+#ifdef CONFIG_IRQSTACKS
+DEFINE_PER_CPU(union irq_stack_union, irq_stack_union);
+
+static void execute_on_irq_stack(void *func, unsigned long param1)
+{
+       unsigned long *irq_stack_start;
+       unsigned long irq_stack;
+       int cpu = smp_processor_id();
+
+       irq_stack_start = &per_cpu(irq_stack_union, cpu).stack[0];
+       irq_stack = (unsigned long) irq_stack_start;
+       irq_stack = ALIGN(irq_stack, 16); /* align for stack frame usage */
+
+       BUG_ON(*irq_stack_start); /* report a bug if we were called recursively. */
+       *irq_stack_start = 1;
+
+       /* This is where we switch to the IRQ stack. */
+       call_on_stack(param1, func, irq_stack);
+
+       *irq_stack_start = 0;
+}
+#endif /* CONFIG_IRQSTACKS */
+
 /* ONLY called from entry.S:intr_extint() */
 void do_cpu_irq_mask(struct pt_regs *regs)
 {
@@ -364,7 +460,13 @@ void do_cpu_irq_mask(struct pt_regs *regs)
                goto set_out;
        }
 #endif
+       stack_overflow_check(regs);
+
+#ifdef CONFIG_IRQSTACKS
+       execute_on_irq_stack(&generic_handle_irq, irq);
+#else
        generic_handle_irq(irq);
+#endif /* CONFIG_IRQSTACKS */
 
  out:
        irq_exit();
@@ -420,6 +522,4 @@ void __init init_IRQ(void)
        cpu_eiem = EIEM_MASK(TIMER_IRQ);
 #endif
         set_eiem(cpu_eiem);    /* EIEM : enable all external intr */
-
 }
-
index 312b48422a56c0c314b3d0f51b07ab95c3db8951..5e1de6072be57f0c92ce04950284fe7f10c5bf4c 100644 (file)
@@ -563,6 +563,15 @@ ENDPROC(copy_page_asm)
  *          %r23 physical page (shifted for tlb insert) of "from" translation
  */
 
+        /* Drop prot bits and convert to page addr for iitlbt and idtlbt */
+        #define PAGE_ADD_SHIFT  (PAGE_SHIFT-12)
+        .macro          convert_phys_for_tlb_insert20  phys
+        extrd,u         \phys, 56-PAGE_ADD_SHIFT, 32-PAGE_ADD_SHIFT, \phys
+#if _PAGE_SIZE_ENCODING_DEFAULT
+        depdi           _PAGE_SIZE_ENCODING_DEFAULT, 63, (63-58), \phys
+#endif
+       .endm
+
        /*
         * We can't do this since copy_user_page is used to bring in
         * file data that might have instructions. Since the data would
@@ -589,15 +598,14 @@ ENTRY(copy_user_page_asm)
        sub             %r25, %r1, %r23
 
        ldil            L%(TMPALIAS_MAP_START), %r28
-       /* FIXME for different page sizes != 4k */
 #ifdef CONFIG_64BIT
 #if (TMPALIAS_MAP_START >= 0x80000000)
        depdi           0, 31,32, %r28          /* clear any sign extension */
 #endif
-       extrd,u         %r26,56,32, %r26        /* convert phys addr to tlb insert format */
-       extrd,u         %r23,56,32, %r23        /* convert phys addr to tlb insert format */
+       convert_phys_for_tlb_insert20 %r26      /* convert phys addr to tlb insert format */
+       convert_phys_for_tlb_insert20 %r23      /* convert phys addr to tlb insert format */
        depd            %r24,63,22, %r28        /* Form aliased virtual address 'to' */
-       depdi           0, 63,12, %r28          /* Clear any offset bits */
+       depdi           0, 63,PAGE_SHIFT, %r28  /* Clear any offset bits */
        copy            %r28, %r29
        depdi           1, 41,1, %r29           /* Form aliased virtual address 'from' */
 #else
@@ -747,11 +755,10 @@ ENTRY(clear_user_page_asm)
 #ifdef CONFIG_64BIT
 #if (TMPALIAS_MAP_START >= 0x80000000)
        depdi           0, 31,32, %r28          /* clear any sign extension */
-       /* FIXME: page size dependend */
 #endif
-       extrd,u         %r26, 56,32, %r26       /* convert phys addr to tlb insert format */
+       convert_phys_for_tlb_insert20 %r26      /* convert phys addr to tlb insert format */
        depd            %r25, 63,22, %r28       /* Form aliased virtual address 'to' */
-       depdi           0, 63,12, %r28          /* Clear any offset bits */
+       depdi           0, 63,PAGE_SHIFT, %r28  /* Clear any offset bits */
 #else
        extrw,u         %r26, 24,25, %r26       /* convert phys addr to tlb insert format */
        depw            %r25, 31,22, %r28       /* Form aliased virtual address 'to' */
@@ -832,11 +839,10 @@ ENTRY(flush_dcache_page_asm)
 #ifdef CONFIG_64BIT
 #if (TMPALIAS_MAP_START >= 0x80000000)
        depdi           0, 31,32, %r28          /* clear any sign extension */
-       /* FIXME: page size dependend */
 #endif
-       extrd,u         %r26, 56,32, %r26       /* convert phys addr to tlb insert format */
+       convert_phys_for_tlb_insert20 %r26      /* convert phys addr to tlb insert format */
        depd            %r25, 63,22, %r28       /* Form aliased virtual address 'to' */
-       depdi           0, 63,12, %r28          /* Clear any offset bits */
+       depdi           0, 63,PAGE_SHIFT, %r28  /* Clear any offset bits */
 #else
        extrw,u         %r26, 24,25, %r26       /* convert phys addr to tlb insert format */
        depw            %r25, 31,22, %r28       /* Form aliased virtual address 'to' */
@@ -909,11 +915,10 @@ ENTRY(flush_icache_page_asm)
 #ifdef CONFIG_64BIT
 #if (TMPALIAS_MAP_START >= 0x80000000)
        depdi           0, 31,32, %r28          /* clear any sign extension */
-       /* FIXME: page size dependend */
 #endif
-       extrd,u         %r26, 56,32, %r26       /* convert phys addr to tlb insert format */
+       convert_phys_for_tlb_insert20 %r26      /* convert phys addr to tlb insert format */
        depd            %r25, 63,22, %r28       /* Form aliased virtual address 'to' */
-       depdi           0, 63,12, %r28          /* Clear any offset bits */
+       depdi           0, 63,PAGE_SHIFT, %r28  /* Clear any offset bits */
 #else
        extrw,u         %r26, 24,25, %r26       /* convert phys addr to tlb insert format */
        depw            %r25, 31,22, %r28       /* Form aliased virtual address 'to' */
@@ -959,7 +964,7 @@ ENTRY(flush_icache_page_asm)
        fic,m           %r1(%sr4,%r28)
        fic,m           %r1(%sr4,%r28)
        fic,m           %r1(%sr4,%r28)
-       cmpb,COND(<<)           %r28, %r25,1b
+       cmpb,COND(<<)   %r28, %r25,1b
        fic,m           %r1(%sr4,%r28)
 
        sync
index a3328c2616b0a6cf3f99eaff2fa2968cc58e1105..76b63e726a539ee912bea1077d00370603e1f538 100644 (file)
@@ -129,6 +129,8 @@ void __init setup_arch(char **cmdline_p)
        printk(KERN_INFO "The 32-bit Kernel has started...\n");
 #endif
 
+       printk(KERN_INFO "Default page size is %dKB.\n", (int)(PAGE_SIZE / 1024));
+
        pdc_console_init();
 
 #ifdef CONFIG_64BIT
index fd1bb1519c2b114be60f3d75089b7d81f29adbd7..e3614fb343e5eca796f919482e8873ae48e17c19 100644 (file)
@@ -127,7 +127,7 @@ ipi_interrupt(int irq, void *dev_id)
        unsigned long flags;
 
        /* Count this now; we may make a call that never returns. */
-       p->ipi_count++;
+       inc_irq_stat(irq_call_count);
 
        mb();   /* Order interrupt and bit testing. */
 
@@ -155,6 +155,7 @@ ipi_interrupt(int irq, void *dev_id)
                                
                        case IPI_RESCHEDULE:
                                smp_debug(100, KERN_DEBUG "CPU%d IPI_RESCHEDULE\n", this_cpu);
+                               inc_irq_stat(irq_resched_count);
                                scheduler_ipi();
                                break;
 
@@ -262,17 +263,6 @@ void arch_send_call_function_single_ipi(int cpu)
        send_IPI_single(cpu, IPI_CALL_FUNC_SINGLE);
 }
 
-/*
- * Flush all other CPU's tlb and then mine.  Do this with on_each_cpu()
- * as we want to ensure all TLB's flushed before proceeding.
- */
-
-void
-smp_flush_tlb_all(void)
-{
-       on_each_cpu(flush_tlb_all_local, NULL, 1);
-}
-
 /*
  * Called by secondaries to update state and initialize CPU registers.
  */
index 5e055240f00bb1f0b98f882db1f1ec5103e8c6ed..e767ab733e321e5619b919a9b8684ac70c35ba9b 100644 (file)
@@ -1,12 +1,35 @@
 /* 
  * Linux/PA-RISC Project (http://www.parisc-linux.org/)
  * 
- * System call entry code Copyright (c) Matthew Wilcox 1999 <willy@bofh.ai>
+ * System call entry code / Linux gateway page
+ * Copyright (c) Matthew Wilcox 1999 <willy@bofh.ai>
  * Licensed under the GNU GPL.
  * thanks to Philipp Rumpf, Mike Shaver and various others
  * sorry about the wall, puffin..
  */
 
+/*
+How does the Linux gateway page on PA-RISC work?
+------------------------------------------------
+The Linux gateway page on PA-RISC is "special".
+It actually has PAGE_GATEWAY bits set (this is linux terminology; in parisc
+terminology it's Execute, promote to PL0) in the page map.  So anything
+executing on this page executes with kernel level privilege (there's more to it
+than that: to have this happen, you also have to use a branch with a ,gate
+completer to activate the privilege promotion).  The upshot is that everything
+that runs on the gateway page runs at kernel privilege but with the current
+user process address space (although you have access to kernel space via %sr2).
+For the 0x100 syscall entry, we redo the space registers to point to the kernel
+address space (preserving the user address space in %sr3), move to wide mode if
+required, save the user registers and branch into the kernel syscall entry
+point.  For all the other functions, we execute at kernel privilege but don't
+flip address spaces. The net effect is that these code snippets are
+executed atomically (because the kernel can't be pre-empted) and they may
+perform architecturally forbidden (to PL3) operations (like setting control
+registers).
+*/
+
+
 #include <asm/asm-offsets.h>
 #include <asm/unistd.h>
 #include <asm/errno.h>
@@ -15,6 +38,7 @@
 #include <asm/thread_info.h>
 #include <asm/assembly.h>
 #include <asm/processor.h>
+#include <asm/cache.h>
 
 #include <linux/linkage.h>
 
@@ -643,7 +667,7 @@ ENTRY(end_linux_gateway_page)
 
        .section .rodata,"a"
 
-       .align PAGE_SIZE
+       .align 8
        /* Light-weight-syscall table */
        /* Start of lws table. */
 ENTRY(lws_table)
@@ -652,13 +676,13 @@ ENTRY(lws_table)
 END(lws_table)
        /* End of lws table */
 
-       .align PAGE_SIZE
+       .align 8
 ENTRY(sys_call_table)
 #include "syscall_table.S"
 END(sys_call_table)
 
 #ifdef CONFIG_64BIT
-       .align PAGE_SIZE
+       .align 8
 ENTRY(sys_call_table64)
 #define SYSCALL_TABLE_64BIT
 #include "syscall_table.S"
@@ -674,7 +698,7 @@ END(sys_call_table64)
                with ldcw.
        */
        .section .data
-       .align  PAGE_SIZE
+       .align  L1_CACHE_BYTES
 ENTRY(lws_lock_start)
        /* lws locks */
        .rept 16
index f702bff0bed9de6c6d5485e7f8ddc87a0ad76490..fe41a98043bbcf287e3ee3562972d6b693e140d0 100644 (file)
@@ -522,10 +522,10 @@ void notrace handle_interruption(int code, struct pt_regs *regs)
         */
        if (((unsigned long)regs->iaoq[0] & 3) &&
            ((unsigned long)regs->iasq[0] != (unsigned long)regs->sr[7])) { 
-               /* Kill the user process later */
-               regs->iaoq[0] = 0 | 3;
+               /* Kill the user process later */
+               regs->iaoq[0] = 0 | 3;
                regs->iaoq[1] = regs->iaoq[0] + 4;
-               regs->iasq[0] = regs->iasq[1] = regs->sr[7];
+               regs->iasq[0] = regs->iasq[1] = regs->sr[7];
                regs->gr[0] &= ~PSW_B;
                return;
        }
@@ -541,8 +541,8 @@ void notrace handle_interruption(int code, struct pt_regs *regs)
                
                /* set up a new led state on systems shipped with a LED State panel */
                pdc_chassis_send_status(PDC_CHASSIS_DIRECT_HPMC);
-                   
-               parisc_terminate("High Priority Machine Check (HPMC)",
+
+               parisc_terminate("High Priority Machine Check (HPMC)",
                                regs, code, 0);
                /* NOT REACHED */
                
@@ -584,13 +584,13 @@ void notrace handle_interruption(int code, struct pt_regs *regs)
                /* Break instruction trap */
                handle_break(regs);
                return;
-       
+
        case 10:
                /* Privileged operation trap */
                die_if_kernel("Privileged operation", regs, code);
                si.si_code = ILL_PRVOPC;
                goto give_sigill;
-       
+
        case 11:
                /* Privileged register trap */
                if ((regs->iir & 0xffdfffe0) == 0x034008a0) {
@@ -634,7 +634,7 @@ void notrace handle_interruption(int code, struct pt_regs *regs)
                if(user_mode(regs)){
                        si.si_signo = SIGFPE;
                        /* Set to zero, and let the userspace app figure it out from
-                          the insn pointed to by si_addr */
+                          the insn pointed to by si_addr */
                        si.si_code = 0;
                        si.si_addr = (void __user *) regs->iaoq[0];
                        force_sig_info(SIGFPE, &si, current);
@@ -648,7 +648,7 @@ void notrace handle_interruption(int code, struct pt_regs *regs)
                die_if_kernel("Floating point exception", regs, 0); /* quiet */
                handle_fpe(regs);
                return;
-               
+
        case 15:
                /* Data TLB miss fault/Data page fault */
                /* Fall through */
@@ -660,15 +660,15 @@ void notrace handle_interruption(int code, struct pt_regs *regs)
        case 17:
                /* Non-access data TLB miss fault/Non-access data page fault */
                /* FIXME: 
-                        Still need to add slow path emulation code here!
-                        If the insn used a non-shadow register, then the tlb
+                        Still need to add slow path emulation code here!
+                        If the insn used a non-shadow register, then the tlb
                         handlers could not have their side-effect (e.g. probe
                         writing to a target register) emulated since rfir would
                         erase the changes to said register. Instead we have to
                         setup everything, call this function we are in, and emulate
                         by hand. Technically we need to emulate:
                         fdc,fdce,pdc,"fic,4f",prober,probeir,probew, probeiw
-               */                        
+               */
                fault_address = regs->ior;
                fault_space = regs->isr;
                break;
index 64a999882e4fb8d0d584da223c7b1f43842e8d2c..4bb095a2f6fc2266388723cbb2634518a9570e44 100644 (file)
@@ -95,7 +95,7 @@ SECTIONS
        NOTES
 
        /* Data */
-       RW_DATA_SECTION(L1_CACHE_BYTES, PAGE_SIZE, THREAD_SIZE)
+       RW_DATA_SECTION(L1_CACHE_BYTES, PAGE_SIZE, PAGE_SIZE)
 
        /* PA-RISC locks requires 16-byte alignment */
        . = ALIGN(16);
index 157b931e7b09b89b60790b60aa016e5f5ac12913..ce939ac8622b84b7278f9e979e68a56354267adf 100644 (file)
@@ -1069,6 +1069,7 @@ void flush_tlb_all(void)
 {
        int do_recycle;
 
+       inc_irq_stat(irq_tlb_count);
        do_recycle = 0;
        spin_lock(&sid_lock);
        if (dirty_space_ids > RECYCLE_THRESHOLD) {
@@ -1089,6 +1090,7 @@ void flush_tlb_all(void)
 #else
 void flush_tlb_all(void)
 {
+       inc_irq_stat(irq_tlb_count);
        spin_lock(&sid_lock);
        flush_tlb_all_local(NULL);
        recycle_sids();
index bbbe02197afbdf151858beecc662adb7b5cddd0c..c33e3ad2c8fd52c9e0c31dfc272faf3d34902f37 100644 (file)
@@ -82,11 +82,6 @@ config GENERIC_HWEIGHT
        bool
        default y
 
-config GENERIC_GPIO
-       bool
-       help
-         Generic GPIO API support
-
 config PPC
        bool
        default y
index bd40bbb15e1476d72a487bcd83e62c8c04cdd155..6e287f1294fa3837a714d50816d3068527c94a9d 100644 (file)
@@ -138,7 +138,6 @@ config PPC4xx_GPIO
        bool "PPC4xx GPIO support"
        depends on 40x
        select ARCH_REQUIRE_GPIOLIB
-       select GENERIC_GPIO
        help
          Enable gpiolib support for ppc40x based boards
 
index 7be93367d92f4dccbb4f0c4eeea747b2b0334402..d6c7506ec7d9d2f84ad91d92e012229db29f2880 100644 (file)
@@ -248,7 +248,6 @@ config PPC4xx_GPIO
        bool "PPC4xx GPIO support"
        depends on 44x
        select ARCH_REQUIRE_GPIOLIB
-       select GENERIC_GPIO
        help
          Enable gpiolib support for ppc440 based boards
 
index 8f02b05f4c96f8f8ef868819bb6e82e203ab063a..efdd37c775ad40dd489f90c02a37c37d78ed3c5f 100644 (file)
@@ -203,7 +203,6 @@ config GE_IMP3A
        select DEFAULT_UIMAGE
        select SWIOTLB
        select MMIO_NVRAM
-       select GENERIC_GPIO
        select ARCH_REQUIRE_GPIOLIB
        select GE_FPGA
        help
@@ -328,7 +327,7 @@ config B4_QDS
        select PPC_E500MC
        select PHYS_64BIT
        select SWIOTLB
-       select GENERIC_GPIO
+       select GPIOLIB
        select ARCH_REQUIRE_GPIOLIB
        select HAS_RAPIDIO
        select PPC_EPAPR_HV_PIC
index 7a6279e38213e9c2444c4cb4ed67a118720390de..1afd1e4a2dd235d50733617b8c5089823b469dfd 100644 (file)
@@ -37,7 +37,6 @@ config GEF_PPC9A
        bool "GE PPC9A"
        select DEFAULT_UIMAGE
        select MMIO_NVRAM
-       select GENERIC_GPIO
        select ARCH_REQUIRE_GPIOLIB
        select GE_FPGA
        help
@@ -47,7 +46,6 @@ config GEF_SBC310
        bool "GE SBC310"
        select DEFAULT_UIMAGE
        select MMIO_NVRAM
-       select GENERIC_GPIO
        select ARCH_REQUIRE_GPIOLIB
        select GE_FPGA
        help
@@ -57,7 +55,6 @@ config GEF_SBC610
        bool "GE SBC610"
        select DEFAULT_UIMAGE
        select MMIO_NVRAM
-       select GENERIC_GPIO
        select ARCH_REQUIRE_GPIOLIB
        select GE_FPGA
        select HAS_RAPIDIO
index 1fb0b3cddeb35ee173c9022c25c4d60f8baea475..8dec3c0911ad0f4ad047b6d68ce8a34b95452cfe 100644 (file)
@@ -114,7 +114,6 @@ config 8xx_COPYBACK
 
 config 8xx_GPIO
        bool "GPIO API Support"
-       select GENERIC_GPIO
        select ARCH_REQUIRE_GPIOLIB
        help
          Saying Y here will cause the ports on an MPC8xx processor to be used
index 34d224be93ba223fcd48b45e32a165281f409907..a881232a3cce1bdafee6f5f2a6536a89afe188ab 100644 (file)
@@ -302,7 +302,6 @@ config QUICC_ENGINE
 config QE_GPIO
        bool "QE GPIO support"
        depends on QUICC_ENGINE
-       select GENERIC_GPIO
        select ARCH_REQUIRE_GPIOLIB
        help
          Say Y here if you're going to use hardware that connects to the
@@ -315,7 +314,6 @@ config CPM2
        select PPC_LIB_RHEAP
        select PPC_PCI_CHOICE
        select ARCH_REQUIRE_GPIOLIB
-       select GENERIC_GPIO
        help
          The CPM2 (Communications Processor Module) is a coprocessor on
          embedded CPUs made by Freescale.  Selecting this option means that
@@ -353,7 +351,6 @@ config OF_RTC
 config SIMPLE_GPIO
        bool "Support for simple, memory-mapped GPIO controllers"
        depends on PPC
-       select GENERIC_GPIO
        select ARCH_REQUIRE_GPIOLIB
        help
          Say Y here to support simple, memory-mapped GPIO controllers.
@@ -364,7 +361,6 @@ config SIMPLE_GPIO
 config MCU_MPC8349EMITX
        bool "MPC8349E-mITX MCU driver"
        depends on I2C=y && PPC_83xx
-       select GENERIC_GPIO
        select ARCH_REQUIRE_GPIOLIB
        help
          Say Y here to enable soft power-off functionality on the Freescale
index 5f7d7ba2874c8a6fa14b49bcefc7ef09ef59dc6b..7a539f4f5e30b776527c5426eabfbefead4a0f34 100644 (file)
@@ -21,6 +21,7 @@
 #include <linux/module.h>
 #include <linux/seq_file.h>
 #include <linux/mount.h>
+#include <linux/aio.h>
 #include <asm/ebcdic.h>
 #include "hypfs.h"
 
index 78d8ace5727253ef135dbc8adb046caa7323a656..8c868cf2cf93febf478cc199e2a996ff64b244d1 100644 (file)
@@ -93,9 +93,6 @@ config GENERIC_CSUM
 config GENERIC_HWEIGHT
        def_bool y
 
-config GENERIC_GPIO
-       def_bool n
-
 config GENERIC_CALIBRATE_DELAY
        bool
 
index 8ae56e9560aca1d6bfebefb3ed0e4a9e35b95677..45d32e3590b9715dc6d3ec08ce41f64123203a35 100644 (file)
@@ -1,4 +1,4 @@
 obj-y  := fpga.o irq.o nmi.o setup.o
 
-obj-$(CONFIG_GENERIC_GPIO)     += gpio.o
+obj-$(CONFIG_GPIOLIB)          += gpio.o
 obj-$(CONFIG_HAVE_SRAM_POOL)   += sram.o
index 708c21c919ff8c46dc71222afaf668d11b31738c..0cbe3d02dea30c6ddd476b153edee195401e82c4 100644 (file)
@@ -1,3 +1,3 @@
 obj-y += setup.o ilsel.o
 
-obj-$(CONFIG_GENERIC_GPIO)     += gpio.o
+obj-$(CONFIG_GPIOLIB)          += gpio.o
index 7fdc102d0dd65572149abfface163e58468f6512..990195d9845607bfcca4a8b3cfcd05c41df582bc 100644 (file)
@@ -21,4 +21,4 @@ pinmux-$(CONFIG_CPU_SUBTYPE_SH7203)   := pinmux-sh7203.o
 pinmux-$(CONFIG_CPU_SUBTYPE_SH7264)    := pinmux-sh7264.o
 pinmux-$(CONFIG_CPU_SUBTYPE_SH7269)    := pinmux-sh7269.o
 
-obj-$(CONFIG_GENERIC_GPIO)     += $(pinmux-y)
+obj-$(CONFIG_GPIOLIB)                  += $(pinmux-y)
index 6f13f33a35ffce3d0877deff87c3547a74754e60..d3634ae7b71a96efa9dc414a43bfe4bb2013a3ca 100644 (file)
@@ -30,4 +30,4 @@ clock-$(CONFIG_CPU_SUBTYPE_SH7712)    := clock-sh7712.o
 pinmux-$(CONFIG_CPU_SUBTYPE_SH7720)    := pinmux-sh7720.o
 
 obj-y  += $(clock-y)
-obj-$(CONFIG_GENERIC_GPIO)     += $(pinmux-y)
+obj-$(CONFIG_GPIOLIB)                  += $(pinmux-y)
index 8fc6ec2be2fa6076ed8644fce2cb126342239449..0705df775208907b51894732a87eeba44b928e70 100644 (file)
@@ -47,6 +47,6 @@ pinmux-$(CONFIG_CPU_SUBTYPE_SHX3)     := pinmux-shx3.o
 
 obj-y                                  += $(clock-y)
 obj-$(CONFIG_SMP)                      += $(smp-y)
-obj-$(CONFIG_GENERIC_GPIO)             += $(pinmux-y)
+obj-$(CONFIG_GPIOLIB)                  += $(pinmux-y)
 obj-$(CONFIG_PERF_EVENTS)              += perf_event.o
 obj-$(CONFIG_HAVE_HW_BREAKPOINT)       += ubc.o
index a639c0d07b8ba952f23f8f11930cd927e93da50d..9ac9f1666339015c3220631a4efe43226844b6d3 100644 (file)
@@ -137,11 +137,6 @@ config GENERIC_ISA_DMA
        bool
        default y if SPARC32
 
-config GENERIC_GPIO
-       bool
-       help
-         Generic GPIO API support
-
 config ARCH_SUPPORTS_DEBUG_PAGEALLOC
        def_bool y if SPARC64
 
index 9b40c9c12a0c986534a722858a2219bb8f87614a..6cfc1b09ec25a28a683a76aa6d2ebb4e1d288a2d 100644 (file)
@@ -253,24 +253,15 @@ void __init leon_smp_done(void)
 
        /* Free unneeded trap tables */
        if (!cpu_present(1)) {
-               ClearPageReserved(virt_to_page(&trapbase_cpu1));
-               init_page_count(virt_to_page(&trapbase_cpu1));
-               free_page((unsigned long)&trapbase_cpu1);
-               totalram_pages++;
+               free_reserved_page(virt_to_page(&trapbase_cpu1));
                num_physpages++;
        }
        if (!cpu_present(2)) {
-               ClearPageReserved(virt_to_page(&trapbase_cpu2));
-               init_page_count(virt_to_page(&trapbase_cpu2));
-               free_page((unsigned long)&trapbase_cpu2);
-               totalram_pages++;
+               free_reserved_page(virt_to_page(&trapbase_cpu2));
                num_physpages++;
        }
        if (!cpu_present(3)) {
-               ClearPageReserved(virt_to_page(&trapbase_cpu3));
-               init_page_count(virt_to_page(&trapbase_cpu3));
-               free_page((unsigned long)&trapbase_cpu3);
-               totalram_pages++;
+               free_reserved_page(virt_to_page(&trapbase_cpu3));
                num_physpages++;
        }
        /* Ok, they are spinning and ready to go. */
index 4490c397bb5b6216ef697814fcb381289e8a96fb..af472cf7c69a1998ab1b16dcba9edb884ef7e743 100644 (file)
@@ -366,45 +366,14 @@ void __init mem_init(void)
 
 void free_initmem (void)
 {
-       unsigned long addr;
-       unsigned long freed;
-
-       addr = (unsigned long)(&__init_begin);
-       freed = (unsigned long)(&__init_end) - addr;
-       for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) {
-               struct page *p;
-
-               memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE);
-               p = virt_to_page(addr);
-
-               ClearPageReserved(p);
-               init_page_count(p);
-               __free_page(p);
-               totalram_pages++;
-               num_physpages++;
-       }
-       printk(KERN_INFO "Freeing unused kernel memory: %ldk freed\n",
-               freed >> 10);
+       num_physpages += free_initmem_default(POISON_FREE_INITMEM);
 }
 
 #ifdef CONFIG_BLK_DEV_INITRD
 void free_initrd_mem(unsigned long start, unsigned long end)
 {
-       if (start < end)
-               printk(KERN_INFO "Freeing initrd memory: %ldk freed\n",
-                       (end - start) >> 10);
-       for (; start < end; start += PAGE_SIZE) {
-               struct page *p;
-
-               memset((void *)start, POISON_FREE_INITMEM, PAGE_SIZE);
-               p = virt_to_page(start);
-
-               ClearPageReserved(p);
-               init_page_count(p);
-               __free_page(p);
-               totalram_pages++;
-               num_physpages++;
-       }
+       num_physpages += free_reserved_area(start, end, POISON_FREE_INITMEM,
+                                           "initrd");
 }
 #endif
 
index cf72a8a5b3aab58e0d741cc739bed85ace8f758f..a7171997adfdd4b7b5bf9154528b94c31b0a3bb3 100644 (file)
@@ -2059,8 +2059,7 @@ void __init mem_init(void)
        /* We subtract one to account for the mem_map_zero page
         * allocated below.
         */
-       totalram_pages -= 1;
-       num_physpages = totalram_pages;
+       num_physpages = totalram_pages - 1;
 
        /*
         * Set up the zero page, mark it reserved, so that page count
@@ -2071,7 +2070,7 @@ void __init mem_init(void)
                prom_printf("paging_init: Cannot alloc zero page.\n");
                prom_halt();
        }
-       SetPageReserved(mem_map_zero);
+       mark_page_reserved(mem_map_zero);
 
        codepages = (((unsigned long) _etext) - ((unsigned long) _start));
        codepages = PAGE_ALIGN(codepages) >> PAGE_SHIFT;
@@ -2111,37 +2110,22 @@ void free_initmem(void)
        initend = (unsigned long)(__init_end) & PAGE_MASK;
        for (; addr < initend; addr += PAGE_SIZE) {
                unsigned long page;
-               struct page *p;
 
                page = (addr +
                        ((unsigned long) __va(kern_base)) -
                        ((unsigned long) KERNBASE));
                memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE);
 
-               if (do_free) {
-                       p = virt_to_page(page);
-
-                       ClearPageReserved(p);
-                       init_page_count(p);
-                       __free_page(p);
-                       totalram_pages++;
-               }
+               if (do_free)
+                       free_reserved_page(virt_to_page(page));
        }
 }
 
 #ifdef CONFIG_BLK_DEV_INITRD
 void free_initrd_mem(unsigned long start, unsigned long end)
 {
-       if (start < end)
-               printk ("Freeing initrd memory: %ldk freed\n", (end - start) >> 10);
-       for (; start < end; start += PAGE_SIZE) {
-               struct page *p = virt_to_page(start);
-
-               ClearPageReserved(p);
-               init_page_count(p);
-               __free_page(p);
-               totalram_pages++;
-       }
+       num_physpages += free_reserved_area(start, end, POISON_FREE_INITMEM,
+                                           "initrd");
 }
 #endif
 
index 41bf72073cccafd9bbd918ded89e6263d6883b93..879990cb66c6264e4db70e4ed812a1710ea6b1df 100644 (file)
@@ -87,7 +87,7 @@ static DEFINE_MUTEX(ubd_lock);
 static DEFINE_MUTEX(ubd_mutex); /* replaces BKL, might not be needed */
 
 static int ubd_open(struct block_device *bdev, fmode_t mode);
-static int ubd_release(struct gendisk *disk, fmode_t mode);
+static void ubd_release(struct gendisk *disk, fmode_t mode);
 static int ubd_ioctl(struct block_device *bdev, fmode_t mode,
                     unsigned int cmd, unsigned long arg);
 static int ubd_getgeo(struct block_device *bdev, struct hd_geometry *geo);
@@ -1138,7 +1138,7 @@ out:
        return err;
 }
 
-static int ubd_release(struct gendisk *disk, fmode_t mode)
+static void ubd_release(struct gendisk *disk, fmode_t mode)
 {
        struct ubd *ubd_dev = disk->private_data;
 
@@ -1146,7 +1146,6 @@ static int ubd_release(struct gendisk *disk, fmode_t mode)
        if(--ubd_dev->count == 0)
                ubd_close_dev(ubd_dev);
        mutex_unlock(&ubd_mutex);
-       return 0;
 }
 
 static void cowify_bitmap(__u64 io_offset, int length, unsigned long *cow_mask,
index 2943e3acdf0cba611bcd0fc222fbc59fb3f6602a..41bcc001344201b7487fc339264c1a513026a5dc 100644 (file)
@@ -23,9 +23,6 @@ config UNICORE32
          designs licensed by PKUnity Ltd.
          Please see web page at <http://www.pkunity.com/>.
 
-config GENERIC_GPIO
-       def_bool y
-
 config GENERIC_CSUM
        def_bool y
 
@@ -156,7 +153,7 @@ source "mm/Kconfig"
 
 config LEDS
        def_bool y
-       depends on GENERIC_GPIO
+       depends on GPIOLIB
 
 config ALIGNMENT_TRAP
        def_bool y
@@ -219,7 +216,6 @@ if ARCH_PUV3
 config PUV3_GPIO
        bool
        depends on !ARCH_FPGA
-       select GENERIC_GPIO
        select GPIO_SYSFS
        default y
 
index 5db2117ae28830757d88db9ad25e54c224332e50..6a154a91c7e746342f35cf6aa13bdae24f544a88 100644 (file)
@@ -174,9 +174,6 @@ config GENERIC_BUG_RELATIVE_POINTERS
 config GENERIC_HWEIGHT
        def_bool y
 
-config GENERIC_GPIO
-       bool
-
 config ARCH_MAY_HAVE_PC_FDC
        def_bool y
        depends on ISA_DMA_API
index 6eb18c42a28a3584546e87a57f9cccf5af3473e3..0e0fabf173429006612eb014ec591df98e4b2deb 100644 (file)
@@ -141,6 +141,11 @@ static int pci_device_update_fixed(struct pci_bus *bus, unsigned int devfn,
  */
 static bool type1_access_ok(unsigned int bus, unsigned int devfn, int reg)
 {
+       if (bus == 0 && (devfn == PCI_DEVFN(2, 0)
+                               || devfn == PCI_DEVFN(0, 0)
+                               || devfn == PCI_DEVFN(3, 0)))
+               return 1;
+
        /* This is a workaround for A0 LNC bug where PCI status register does
         * not have new CAP bit set. can not be written by SW either.
         *
@@ -150,10 +155,7 @@ static bool type1_access_ok(unsigned int bus, unsigned int devfn, int reg)
         */
        if (reg >= 0x100 || reg == PCI_STATUS || reg == PCI_HEADER_TYPE)
                return 0;
-       if (bus == 0 && (devfn == PCI_DEVFN(2, 0)
-                               || devfn == PCI_DEVFN(0, 0)
-                               || devfn == PCI_DEVFN(3, 0)))
-               return 1;
+
        return 0; /* langwell on others */
 }
 
index b09de49dbec5cf0a7d65590afc7c2a97158399bf..acdfc615cca2a5f2a729c9fdea9480abf2d901af 100644 (file)
@@ -33,9 +33,6 @@ config RWSEM_XCHGADD_ALGORITHM
 config GENERIC_HWEIGHT
        def_bool y
 
-config GENERIC_GPIO
-       bool
-
 config ARCH_HAS_ILOG2_U32
        def_bool n
 
index ddab37b24741f03e901d2078911d4c576734f06d..77c52f80187a220cc1b6982d235d0055a779f208 100644 (file)
@@ -10,7 +10,6 @@ CONFIG_RWSEM_XCHGADD_ALGORITHM=y
 CONFIG_GENERIC_FIND_NEXT_BIT=y
 CONFIG_GENERIC_HWEIGHT=y
 CONFIG_GENERIC_HARDIRQS=y
-CONFIG_GENERIC_GPIO=y
 # CONFIG_ARCH_HAS_ILOG2_U32 is not set
 # CONFIG_ARCH_HAS_ILOG2_U64 is not set
 CONFIG_NO_IOPORT=y
index eaf1b8fc655652a11f0aaa8ab479e49f07c064bc..4799c6a526b582658af0ebf34ffc08132b42dacf 100644 (file)
@@ -10,7 +10,6 @@ CONFIG_RWSEM_XCHGADD_ALGORITHM=y
 CONFIG_GENERIC_FIND_NEXT_BIT=y
 CONFIG_GENERIC_HWEIGHT=y
 CONFIG_GENERIC_HARDIRQS=y
-CONFIG_GENERIC_GPIO=y
 # CONFIG_ARCH_HAS_ILOG2_U32 is not set
 # CONFIG_ARCH_HAS_ILOG2_U64 is not set
 CONFIG_NO_IOPORT=y
index 88608cc11b8cae55169bc06af3a8dd962d4ae0dc..0345f43d34f365c98e73bbed0659dc62911bbce2 100644 (file)
@@ -139,13 +139,12 @@ static int simdisk_open(struct block_device *bdev, fmode_t mode)
        return 0;
 }
 
-static int simdisk_release(struct gendisk *disk, fmode_t mode)
+static void simdisk_release(struct gendisk *disk, fmode_t mode)
 {
        struct simdisk *dev = disk->private_data;
        spin_lock(&dev->lock);
        --dev->users;
        spin_unlock(&dev->lock);
-       return 0;
 }
 
 static const struct block_device_operations simdisk_ops = {
index b2b9837f9dd3475be841d41ed52dd7715e936d4b..e8918ffaf96d4a0a2dacf75838b5d8a89e5e8ca3 100644 (file)
@@ -972,10 +972,10 @@ int blkcg_activate_policy(struct request_queue *q,
        if (!new_blkg)
                return -ENOMEM;
 
-       preloaded = !radix_tree_preload(GFP_KERNEL);
-
        blk_queue_bypass_start(q);
 
+       preloaded = !radix_tree_preload(GFP_KERNEL);
+
        /*
         * Make sure the root blkg exists and count the existing blkgs.  As
         * @q is bypassing at this point, blkg_lookup_create() can't be
index 7c288358a745ad2312e93080201e341cf2b69207..33c33bc99ddd5546e6ba30ce267cb436d0328c51 100644 (file)
@@ -30,6 +30,7 @@
 #include <linux/list_sort.h>
 #include <linux/delay.h>
 #include <linux/ratelimit.h>
+#include <linux/pm_runtime.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/block.h>
@@ -159,20 +160,10 @@ static void req_bio_endio(struct request *rq, struct bio *bio,
        else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
                error = -EIO;
 
-       if (unlikely(nbytes > bio->bi_size)) {
-               printk(KERN_ERR "%s: want %u bytes done, %u left\n",
-                      __func__, nbytes, bio->bi_size);
-               nbytes = bio->bi_size;
-       }
-
        if (unlikely(rq->cmd_flags & REQ_QUIET))
                set_bit(BIO_QUIET, &bio->bi_flags);
 
-       bio->bi_size -= nbytes;
-       bio->bi_sector += (nbytes >> 9);
-
-       if (bio_integrity(bio))
-               bio_integrity_advance(bio, nbytes);
+       bio_advance(bio, nbytes);
 
        /* don't actually finish bio if it's part of flush sequence */
        if (bio->bi_size == 0 && !(rq->cmd_flags & REQ_FLUSH_SEQ))
@@ -1264,6 +1255,16 @@ void part_round_stats(int cpu, struct hd_struct *part)
 }
 EXPORT_SYMBOL_GPL(part_round_stats);
 
+#ifdef CONFIG_PM_RUNTIME
+static void blk_pm_put_request(struct request *rq)
+{
+       if (rq->q->dev && !(rq->cmd_flags & REQ_PM) && !--rq->q->nr_pending)
+               pm_runtime_mark_last_busy(rq->q->dev);
+}
+#else
+static inline void blk_pm_put_request(struct request *rq) {}
+#endif
+
 /*
  * queue lock must be held
  */
@@ -1274,6 +1275,8 @@ void __blk_put_request(struct request_queue *q, struct request *req)
        if (unlikely(--req->ref_count))
                return;
 
+       blk_pm_put_request(req);
+
        elv_completed_request(q, req);
 
        /* this is a bio leak */
@@ -1597,7 +1600,7 @@ static void handle_bad_sector(struct bio *bio)
        printk(KERN_INFO "%s: rw=%ld, want=%Lu, limit=%Lu\n",
                        bdevname(bio->bi_bdev, b),
                        bio->bi_rw,
-                       (unsigned long long)bio->bi_sector + bio_sectors(bio),
+                       (unsigned long long)bio_end_sector(bio),
                        (long long)(i_size_read(bio->bi_bdev->bd_inode) >> 9));
 
        set_bit(BIO_EOF, &bio->bi_flags);
@@ -2053,6 +2056,28 @@ static void blk_account_io_done(struct request *req)
        }
 }
 
+#ifdef CONFIG_PM_RUNTIME
+/*
+ * Don't process normal requests when queue is suspended
+ * or in the process of suspending/resuming
+ */
+static struct request *blk_pm_peek_request(struct request_queue *q,
+                                          struct request *rq)
+{
+       if (q->dev && (q->rpm_status == RPM_SUSPENDED ||
+           (q->rpm_status != RPM_ACTIVE && !(rq->cmd_flags & REQ_PM))))
+               return NULL;
+       else
+               return rq;
+}
+#else
+static inline struct request *blk_pm_peek_request(struct request_queue *q,
+                                                 struct request *rq)
+{
+       return rq;
+}
+#endif
+
 /**
  * blk_peek_request - peek at the top of a request queue
  * @q: request queue to peek at
@@ -2075,6 +2100,11 @@ struct request *blk_peek_request(struct request_queue *q)
        int ret;
 
        while ((rq = __elv_next_request(q)) != NULL) {
+
+               rq = blk_pm_peek_request(q, rq);
+               if (!rq)
+                       break;
+
                if (!(rq->cmd_flags & REQ_STARTED)) {
                        /*
                         * This is the first time the device driver
@@ -2253,8 +2283,7 @@ EXPORT_SYMBOL(blk_fetch_request);
  **/
 bool blk_update_request(struct request *req, int error, unsigned int nr_bytes)
 {
-       int total_bytes, bio_nbytes, next_idx = 0;
-       struct bio *bio;
+       int total_bytes;
 
        if (!req->bio)
                return false;
@@ -2300,56 +2329,21 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes)
 
        blk_account_io_completion(req, nr_bytes);
 
-       total_bytes = bio_nbytes = 0;
-       while ((bio = req->bio) != NULL) {
-               int nbytes;
+       total_bytes = 0;
+       while (req->bio) {
+               struct bio *bio = req->bio;
+               unsigned bio_bytes = min(bio->bi_size, nr_bytes);
 
-               if (nr_bytes >= bio->bi_size) {
+               if (bio_bytes == bio->bi_size)
                        req->bio = bio->bi_next;
-                       nbytes = bio->bi_size;
-                       req_bio_endio(req, bio, nbytes, error);
-                       next_idx = 0;
-                       bio_nbytes = 0;
-               } else {
-                       int idx = bio->bi_idx + next_idx;
 
-                       if (unlikely(idx >= bio->bi_vcnt)) {
-                               blk_dump_rq_flags(req, "__end_that");
-                               printk(KERN_ERR "%s: bio idx %d >= vcnt %d\n",
-                                      __func__, idx, bio->bi_vcnt);
-                               break;
-                       }
+               req_bio_endio(req, bio, bio_bytes, error);
 
-                       nbytes = bio_iovec_idx(bio, idx)->bv_len;
-                       BIO_BUG_ON(nbytes > bio->bi_size);
+               total_bytes += bio_bytes;
+               nr_bytes -= bio_bytes;
 
-                       /*
-                        * not a complete bvec done
-                        */
-                       if (unlikely(nbytes > nr_bytes)) {
-                               bio_nbytes += nr_bytes;
-                               total_bytes += nr_bytes;
-                               break;
-                       }
-
-                       /*
-                        * advance to the next vector
-                        */
-                       next_idx++;
-                       bio_nbytes += nbytes;
-               }
-
-               total_bytes += nbytes;
-               nr_bytes -= nbytes;
-
-               bio = req->bio;
-               if (bio) {
-                       /*
-                        * end more in this run, or just return 'not-done'
-                        */
-                       if (unlikely(nr_bytes <= 0))
-                               break;
-               }
+               if (!nr_bytes)
+                       break;
        }
 
        /*
@@ -2365,16 +2359,6 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes)
                return false;
        }
 
-       /*
-        * if the request wasn't completed, update state
-        */
-       if (bio_nbytes) {
-               req_bio_endio(req, bio, bio_nbytes, error);
-               bio->bi_idx += next_idx;
-               bio_iovec(bio)->bv_offset += nr_bytes;
-               bio_iovec(bio)->bv_len -= nr_bytes;
-       }
-
        req->__data_len -= total_bytes;
        req->buffer = bio_data(req->bio);
 
@@ -3046,6 +3030,149 @@ void blk_finish_plug(struct blk_plug *plug)
 }
 EXPORT_SYMBOL(blk_finish_plug);
 
+#ifdef CONFIG_PM_RUNTIME
+/**
+ * blk_pm_runtime_init - Block layer runtime PM initialization routine
+ * @q: the queue of the device
+ * @dev: the device the queue belongs to
+ *
+ * Description:
+ *    Initialize runtime-PM-related fields for @q and start auto suspend for
+ *    @dev. Drivers that want to take advantage of request-based runtime PM
+ *    should call this function after @dev has been initialized and its
+ *    request queue @q has been allocated, but while runtime PM cannot happen
+ *    yet (either because it is disabled/forbidden or because its
+ *    usage_count > 0). In most cases, the driver should call this function
+ *    before any I/O has taken place.
+ *
+ *    This function takes care of setting up autosuspend for the device; the
+ *    autosuspend delay is set to -1 to make runtime suspend impossible until
+ *    an updated value is set by either the user or the driver. Drivers do
+ *    not need to touch other autosuspend settings.
+ *
+ *    Block layer runtime PM is request based, so it only works for drivers
+ *    that use requests as their I/O unit, not for those that submit bios
+ *    directly.
+ */
+void blk_pm_runtime_init(struct request_queue *q, struct device *dev)
+{
+       q->dev = dev;
+       q->rpm_status = RPM_ACTIVE;
+       pm_runtime_set_autosuspend_delay(q->dev, -1);
+       pm_runtime_use_autosuspend(q->dev);
+}
+EXPORT_SYMBOL(blk_pm_runtime_init);
+
+/**
+ * blk_pre_runtime_suspend - Pre runtime suspend check
+ * @q: the queue of the device
+ *
+ * Description:
+ *    This function will check if runtime suspend is allowed for the device
+ *    by examining if there are any requests pending in the queue. If there
+ *    are requests pending, the device can not be runtime suspended; otherwise,
+ *    the queue's status will be updated to SUSPENDING and the driver can
+ *    proceed to suspend the device.
+ *
+ *    In the not-allowed case, we mark the device as last busy so that the
+ *    runtime PM core will try to autosuspend it again some time later.
+ *
+ *    This function should be called near the start of the device's
+ *    runtime_suspend callback.
+ *
+ * Return:
+ *    0                - OK to runtime suspend the device
+ *    -EBUSY   - Device should not be runtime suspended
+ */
+int blk_pre_runtime_suspend(struct request_queue *q)
+{
+       int ret = 0;
+
+       spin_lock_irq(q->queue_lock);
+       if (q->nr_pending) {
+               ret = -EBUSY;
+               pm_runtime_mark_last_busy(q->dev);
+       } else {
+               q->rpm_status = RPM_SUSPENDING;
+       }
+       spin_unlock_irq(q->queue_lock);
+       return ret;
+}
+EXPORT_SYMBOL(blk_pre_runtime_suspend);
+
+/**
+ * blk_post_runtime_suspend - Post runtime suspend processing
+ * @q: the queue of the device
+ * @err: return value of the device's runtime_suspend function
+ *
+ * Description:
+ *    Update the queue's runtime status according to the return value of the
+ *    device's runtime suspend function and mark last busy for the device so
+ *    that PM core will try to auto suspend the device at a later time.
+ *
+ *    This function should be called near the end of the device's
+ *    runtime_suspend callback.
+ */
+void blk_post_runtime_suspend(struct request_queue *q, int err)
+{
+       spin_lock_irq(q->queue_lock);
+       if (!err) {
+               q->rpm_status = RPM_SUSPENDED;
+       } else {
+               q->rpm_status = RPM_ACTIVE;
+               pm_runtime_mark_last_busy(q->dev);
+       }
+       spin_unlock_irq(q->queue_lock);
+}
+EXPORT_SYMBOL(blk_post_runtime_suspend);
+
+/**
+ * blk_pre_runtime_resume - Pre runtime resume processing
+ * @q: the queue of the device
+ *
+ * Description:
+ *    Update the queue's runtime status to RESUMING in preparation for the
+ *    runtime resume of the device.
+ *
+ *    This function should be called near the start of the device's
+ *    runtime_resume callback.
+ */
+void blk_pre_runtime_resume(struct request_queue *q)
+{
+       spin_lock_irq(q->queue_lock);
+       q->rpm_status = RPM_RESUMING;
+       spin_unlock_irq(q->queue_lock);
+}
+EXPORT_SYMBOL(blk_pre_runtime_resume);
+
+/**
+ * blk_post_runtime_resume - Post runtime resume processing
+ * @q: the queue of the device
+ * @err: return value of the device's runtime_resume function
+ *
+ * Description:
+ *    Update the queue's runtime status according to the return value of the
+ *    device's runtime_resume function. If the device was successfully resumed,
+ *    process the requests that were queued while it was resuming, then mark it
+ *    last busy and initiate autosuspend for it.
+ *
+ *    This function should be called near the end of the device's
+ *    runtime_resume callback.
+ */
+void blk_post_runtime_resume(struct request_queue *q, int err)
+{
+       spin_lock_irq(q->queue_lock);
+       if (!err) {
+               q->rpm_status = RPM_ACTIVE;
+               __blk_run_queue(q);
+               pm_runtime_mark_last_busy(q->dev);
+               pm_runtime_autosuspend(q->dev);
+       } else {
+               q->rpm_status = RPM_SUSPENDED;
+       }
+       spin_unlock_irq(q->queue_lock);
+}
+EXPORT_SYMBOL(blk_post_runtime_resume);
+#endif
+
 int __init blk_dev_init(void)
 {
        BUILD_BUG_ON(__REQ_NR_BITS > 8 *
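
Taken together, the runtime PM helpers added above are meant to be driven from a request-based driver's runtime_suspend/runtime_resume callbacks. The sketch below only illustrates that wiring and is not code from this patch: the mydrv structure, the hardware hooks and the 5000 ms autosuspend delay are hypothetical.

	#include <linux/blkdev.h>
	#include <linux/pm_runtime.h>

	struct mydrv {
		struct device *dev;
		struct request_queue *queue;
	};

	/* Hypothetical hardware hooks; a real driver would quiesce/wake its device here. */
	static int mydrv_hw_powerdown(struct mydrv *md) { return 0; }
	static int mydrv_hw_powerup(struct mydrv *md) { return 0; }

	static int mydrv_runtime_suspend(struct device *dev)
	{
		struct mydrv *md = dev_get_drvdata(dev);
		int err = blk_pre_runtime_suspend(md->queue);

		if (err)
			return err;	/* -EBUSY: requests still pending, stay powered */

		err = mydrv_hw_powerdown(md);
		blk_post_runtime_suspend(md->queue, err);
		return err;
	}

	static int mydrv_runtime_resume(struct device *dev)
	{
		struct mydrv *md = dev_get_drvdata(dev);
		int err;

		blk_pre_runtime_resume(md->queue);
		err = mydrv_hw_powerup(md);
		blk_post_runtime_resume(md->queue, err);
		return err;
	}

	/* At probe time, once the queue exists and before runtime PM is allowed: */
	static void mydrv_setup_runtime_pm(struct mydrv *md)
	{
		blk_pm_runtime_init(md->queue, md->dev);
		pm_runtime_set_autosuspend_delay(md->dev, 5000);
		pm_runtime_allow(md->dev);
	}
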
index 4f0ade74cfd04a1c48f22218a6c4369517efa88b..d5cd3131c57a36645bcf049e28ab9a46ab5ea559 100644 (file)
@@ -2270,11 +2270,8 @@ cfq_find_rq_fmerge(struct cfq_data *cfqd, struct bio *bio)
                return NULL;
 
        cfqq = cic_to_cfqq(cic, cfq_bio_sync(bio));
-       if (cfqq) {
-               sector_t sector = bio->bi_sector + bio_sectors(bio);
-
-               return elv_rb_find(&cfqq->sort_list, sector);
-       }
+       if (cfqq)
+               return elv_rb_find(&cfqq->sort_list, bio_end_sector(bio));
 
        return NULL;
 }
index 90037b5eb17fca86bc2212241a6d1916cf83960c..ba19a3afab7929cf3ff0857683ccaef40a007a77 100644 (file)
@@ -132,7 +132,7 @@ deadline_merge(struct request_queue *q, struct request **req, struct bio *bio)
         * check for front merge
         */
        if (dd->front_merges) {
-               sector_t sector = bio->bi_sector + bio_sectors(bio);
+               sector_t sector = bio_end_sector(bio);
 
                __rq = elv_rb_find(&dd->sort_list[bio_data_dir(bio)], sector);
                if (__rq) {
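
The two hunks above (cfq and deadline) replace the open-coded end-sector computation with the bio_end_sector() helper; behaviour is unchanged, it is the same arithmetic behind a name. A quick illustration with made-up values:

	/* bio->bi_sector == 1000, bio_sectors(bio) == 8 (4 KiB of data)              */
	sector_t end = bio_end_sector(bio);	/* 1000 + 8 == 1008, one past the last sector */
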
index a0ffdd943c98aa5e0f39f102b98f1bc4cf9777f5..eba5b04c29b135bdc6b84ac80690d380d59dd8a5 100644 (file)
@@ -34,6 +34,7 @@
 #include <linux/blktrace_api.h>
 #include <linux/hash.h>
 #include <linux/uaccess.h>
+#include <linux/pm_runtime.h>
 
 #include <trace/events/block.h>
 
@@ -536,6 +537,27 @@ void elv_bio_merged(struct request_queue *q, struct request *rq,
                e->type->ops.elevator_bio_merged_fn(q, rq, bio);
 }
 
+#ifdef CONFIG_PM_RUNTIME
+static void blk_pm_requeue_request(struct request *rq)
+{
+       if (rq->q->dev && !(rq->cmd_flags & REQ_PM))
+               rq->q->nr_pending--;
+}
+
+static void blk_pm_add_request(struct request_queue *q, struct request *rq)
+{
+       if (q->dev && !(rq->cmd_flags & REQ_PM) && q->nr_pending++ == 0 &&
+           (q->rpm_status == RPM_SUSPENDED || q->rpm_status == RPM_SUSPENDING))
+               pm_request_resume(q->dev);
+}
+#else
+static inline void blk_pm_requeue_request(struct request *rq) {}
+static inline void blk_pm_add_request(struct request_queue *q,
+                                     struct request *rq)
+{
+}
+#endif
+
 void elv_requeue_request(struct request_queue *q, struct request *rq)
 {
        /*
@@ -550,6 +572,8 @@ void elv_requeue_request(struct request_queue *q, struct request *rq)
 
        rq->cmd_flags &= ~REQ_STARTED;
 
+       blk_pm_requeue_request(rq);
+
        __elv_add_request(q, rq, ELEVATOR_INSERT_REQUEUE);
 }
 
@@ -572,6 +596,8 @@ void __elv_add_request(struct request_queue *q, struct request *rq, int where)
 {
        trace_block_rq_insert(q, rq);
 
+       blk_pm_add_request(q, rq);
+
        rq->q = q;
 
        if (rq->cmd_flags & REQ_SOFTBARRIER) {
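
The blk_pm_add_request()/blk_pm_requeue_request() hooks above keep q->nr_pending in step with requests entering and being requeued through the elevator, which is what lets blk_pre_runtime_suspend() in blk-core.c refuse to suspend while work is queued. A hypothetical sequence for an idle, runtime-suspended device:

	/* 1. __elv_add_request(): nr_pending goes 0 -> 1 while rpm_status is      */
	/*    RPM_SUSPENDED, so blk_pm_add_request() calls pm_request_resume().    */
	/* 2. The driver's runtime_resume runs; blk_post_runtime_resume() sets     */
	/*    RPM_ACTIVE and __blk_run_queue() dispatches the waiting request.     */
	/* 3. If that request is requeued, blk_pm_requeue_request() decrements     */
	/*    nr_pending first, because the following __elv_add_request() will     */
	/*    count it again via blk_pm_add_request().                             */
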
index ff5804e2f1d2ffeb40d7d7446af9867e0b48bd03..c85fc895ecdbbeba25b624e1af909bfa61f3acc2 100644 (file)
@@ -238,7 +238,7 @@ static gpt_entry *alloc_read_gpt_entries(struct parsed_partitions *state,
                 le32_to_cpu(gpt->sizeof_partition_entry);
        if (!count)
                return NULL;
-       pte = kzalloc(count, GFP_KERNEL);
+       pte = kmalloc(count, GFP_KERNEL);
        if (!pte)
                return NULL;
 
@@ -267,7 +267,7 @@ static gpt_header *alloc_read_gpt_header(struct parsed_partitions *state,
        gpt_header *gpt;
        unsigned ssz = bdev_logical_block_size(state->bdev);
 
-       gpt = kzalloc(ssz, GFP_KERNEL);
+       gpt = kmalloc(ssz, GFP_KERNEL);
        if (!gpt)
                return NULL;
 
index 9a87daa6f4fbd10202ea9d47ae1a549c806a6fb4..a5ffcc988f0b00441a29f76e63e874892d121350 100644 (file)
@@ -27,6 +27,7 @@
 #include <linux/ratelimit.h>
 #include <linux/slab.h>
 #include <linux/times.h>
+#include <linux/uio.h>
 #include <asm/uaccess.h>
 
 #include <scsi/scsi.h>
index 9a7f0e3ab5a33fbb012a5c2f74145cc47c346dcd..11115bbe115cb120c778b4818d1e14a05f2e012e 100644 (file)
@@ -21,7 +21,7 @@
 #include <linux/serial_reg.h>
 #include <linux/time.h>
 
-static const char *part_probes[] = { "bcm47xxpart", NULL };
+static const char * const part_probes[] = { "bcm47xxpart", NULL };
 
 static struct physmap_flash_data bcma_pflash_data = {
        .part_probe_types       = part_probes,
index 386146d792d1a5e51ae8af06ed1ec001d08d1b06..4ff85b8785ee180a5637f56a43ab87f61dcbe2c6 100644 (file)
@@ -1634,7 +1634,7 @@ static int floppy_open(struct block_device *bdev, fmode_t mode)
        return 0;
 }
 
-static int floppy_release(struct gendisk *disk, fmode_t mode)
+static void floppy_release(struct gendisk *disk, fmode_t mode)
 {
        struct amiga_floppy_struct *p = disk->private_data;
        int drive = p - unit;
@@ -1654,7 +1654,6 @@ static int floppy_release(struct gendisk *disk, fmode_t mode)
        floppy_off (drive | 0x40000000);
 #endif
        mutex_unlock(&amiflop_mutex);
-       return 0;
 }
 
 /*
index a129f8c8073db35469f4c0f8232ed3eb44f51d05..916d9ed5c8aa6d1f1ae3873a924f056015cd651c 100644 (file)
@@ -169,7 +169,7 @@ aoeblk_open(struct block_device *bdev, fmode_t mode)
        return -ENODEV;
 }
 
-static int
+static void
 aoeblk_release(struct gendisk *disk, fmode_t mode)
 {
        struct aoedev *d = disk->private_data;
@@ -180,11 +180,9 @@ aoeblk_release(struct gendisk *disk, fmode_t mode)
        if (--d->nopen == 0) {
                spin_unlock_irqrestore(&d->lock, flags);
                aoecmd_cfg(d->aoemajor, d->aoeminor);
-               return 0;
+               return;
        }
        spin_unlock_irqrestore(&d->lock, flags);
-
-       return 0;
 }
 
 static void
index 92b6d7c51e39590b3780f17c88737009b7becbbf..fc803ecbbce4a2d9d12a27d6a822f70a42b7732d 100644 (file)
@@ -920,16 +920,14 @@ bio_pagedec(struct bio *bio)
 static void
 bufinit(struct buf *buf, struct request *rq, struct bio *bio)
 {
-       struct bio_vec *bv;
-
        memset(buf, 0, sizeof(*buf));
        buf->rq = rq;
        buf->bio = bio;
        buf->resid = bio->bi_size;
        buf->sector = bio->bi_sector;
        bio_pageinc(bio);
-       buf->bv = bv = &bio->bi_io_vec[bio->bi_idx];
-       buf->bv_resid = bv->bv_len;
+       buf->bv = bio_iovec(bio);
+       buf->bv_resid = buf->bv->bv_len;
        WARN_ON(buf->bv_resid == 0);
 }
 
index ede16c64ff07dbb524cdf84ba8e3622d9efe306b..0e30c6e5492a54e9cd703271e9c3e7b5ec5d59b3 100644 (file)
@@ -367,7 +367,7 @@ static void fd_probe( int drive );
 static int fd_test_drive_present( int drive );
 static void config_types( void );
 static int floppy_open(struct block_device *bdev, fmode_t mode);
-static int floppy_release(struct gendisk *disk, fmode_t mode);
+static void floppy_release(struct gendisk *disk, fmode_t mode);
 
 /************************* End of Prototypes **************************/
 
@@ -1886,7 +1886,7 @@ static int floppy_unlocked_open(struct block_device *bdev, fmode_t mode)
        return ret;
 }
 
-static int floppy_release(struct gendisk *disk, fmode_t mode)
+static void floppy_release(struct gendisk *disk, fmode_t mode)
 {
        struct atari_floppy_struct *p = disk->private_data;
        mutex_lock(&ataflop_mutex);
@@ -1897,7 +1897,6 @@ static int floppy_release(struct gendisk *disk, fmode_t mode)
                p->ref = 0;
        }
        mutex_unlock(&ataflop_mutex);
-       return 0;
 }
 
 static const struct block_device_operations floppy_fops = {
index 531ceb31d0ff86a4aec90301ea508254ea443457..f1a29f8e9d33dbe45474172a24f952f32ce4ba47 100644 (file)
@@ -334,8 +334,7 @@ static void brd_make_request(struct request_queue *q, struct bio *bio)
        int err = -EIO;
 
        sector = bio->bi_sector;
-       if (sector + (bio->bi_size >> SECTOR_SHIFT) >
-                                               get_capacity(bdev->bd_disk))
+       if (bio_end_sector(bio) > get_capacity(bdev->bd_disk))
                goto out;
 
        if (unlikely(bio->bi_rw & REQ_DISCARD)) {
index e18c99140c0aa37e2df84c0dd718ed794f9c2265..6374dc103521f451863cb988cc68921448eb2ae1 100644 (file)
@@ -75,6 +75,12 @@ module_param(cciss_simple_mode, int, S_IRUGO|S_IWUSR);
 MODULE_PARM_DESC(cciss_simple_mode,
        "Use 'simple mode' rather than 'performant mode'");
 
+static int cciss_allow_hpsa;
+module_param(cciss_allow_hpsa, int, S_IRUGO|S_IWUSR);
+MODULE_PARM_DESC(cciss_allow_hpsa,
+       "Prevent cciss driver from accessing hardware known to be "
+       "supported by the hpsa driver");
+
 static DEFINE_MUTEX(cciss_mutex);
 static struct proc_dir_entry *proc_cciss;
 
@@ -161,7 +167,7 @@ static irqreturn_t do_cciss_intx(int irq, void *dev_id);
 static irqreturn_t do_cciss_msix_intr(int irq, void *dev_id);
 static int cciss_open(struct block_device *bdev, fmode_t mode);
 static int cciss_unlocked_open(struct block_device *bdev, fmode_t mode);
-static int cciss_release(struct gendisk *disk, fmode_t mode);
+static void cciss_release(struct gendisk *disk, fmode_t mode);
 static int do_ioctl(struct block_device *bdev, fmode_t mode,
                    unsigned int cmd, unsigned long arg);
 static int cciss_ioctl(struct block_device *bdev, fmode_t mode,
@@ -1123,7 +1129,7 @@ static int cciss_unlocked_open(struct block_device *bdev, fmode_t mode)
 /*
  * Close.  Sync first.
  */
-static int cciss_release(struct gendisk *disk, fmode_t mode)
+static void cciss_release(struct gendisk *disk, fmode_t mode)
 {
        ctlr_info_t *h;
        drive_info_struct *drv;
@@ -1135,7 +1141,6 @@ static int cciss_release(struct gendisk *disk, fmode_t mode)
        drv->usage_count--;
        h->usage_count--;
        mutex_unlock(&cciss_mutex);
-       return 0;
 }
 
 static int do_ioctl(struct block_device *bdev, fmode_t mode,
@@ -4116,9 +4121,13 @@ static int cciss_lookup_board_id(struct pci_dev *pdev, u32 *board_id)
        *board_id = ((subsystem_device_id << 16) & 0xffff0000) |
                        subsystem_vendor_id;
 
-       for (i = 0; i < ARRAY_SIZE(products); i++)
+       for (i = 0; i < ARRAY_SIZE(products); i++) {
+               /* Stand aside for hpsa driver on request */
+               if (cciss_allow_hpsa)
+                       return -ENODEV;
                if (*board_id == products[i].board_id)
                        return i;
+       }
        dev_warn(&pdev->dev, "unrecognized board ID: 0x%08x, ignoring.\n",
                *board_id);
        return -ENODEV;
@@ -4960,6 +4969,16 @@ static int cciss_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
        ctlr_info_t *h;
        unsigned long flags;
 
+       /*
+        * By default the cciss driver is used for all older HP Smart Array
+        * controllers. There are module parameters that allow a user to
+        * override this behavior and instead use the hpsa SCSI driver. If
+        * this is the case, cciss may be loaded first from the kdump initrd
+        * image and cause a kernel panic. So if reset_devices is true and
+        * cciss_allow_hpsa is set, just bail.
+        */
+       if ((reset_devices) && (cciss_allow_hpsa == 1))
+               return -ENODEV;
        rc = cciss_init_reset_devices(pdev);
        if (rc) {
                if (rc != -ENOTSUPP)
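
For completeness, the new knob is set like any other module parameter; neither line below comes from this patch, they are just the standard forms for a built-in and a modular cciss respectively:

	cciss.cciss_allow_hpsa=1              (kernel command line, built-in driver)
	modprobe cciss cciss_allow_hpsa=1     (at module load time)
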
index 3b9e8ebcb96bbbf1d367cb749ce7a7b5c7de029f..639d26b90b9117a56c69f991663f603847cc206c 100644 (file)
@@ -160,7 +160,7 @@ static int sendcmd(
        unsigned int log_unit );
 
 static int ida_unlocked_open(struct block_device *bdev, fmode_t mode);
-static int ida_release(struct gendisk *disk, fmode_t mode);
+static void ida_release(struct gendisk *disk, fmode_t mode);
 static int ida_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg);
 static int ida_getgeo(struct block_device *bdev, struct hd_geometry *geo);
 static int ida_ctlr_ioctl(ctlr_info_t *h, int dsk, ida_ioctl_t *io);
@@ -856,7 +856,7 @@ static int ida_unlocked_open(struct block_device *bdev, fmode_t mode)
 /*
  * Close.  Sync first.
  */
-static int ida_release(struct gendisk *disk, fmode_t mode)
+static void ida_release(struct gendisk *disk, fmode_t mode)
 {
        ctlr_info_t *host;
 
@@ -864,8 +864,6 @@ static int ida_release(struct gendisk *disk, fmode_t mode)
        host = get_host(disk);
        host->usage_count--;
        mutex_unlock(&cpqarray_mutex);
-
-       return 0;
 }
 
 /*
index 92510f8ad0131f480aac0fddc48d8e889f8d690f..6608076dc39e73cda5f3aaf705579e3189421e3e 100644 (file)
@@ -104,7 +104,6 @@ struct update_al_work {
        int err;
 };
 
-static int al_write_transaction(struct drbd_conf *mdev);
 
 void *drbd_md_get_buffer(struct drbd_conf *mdev)
 {
@@ -168,7 +167,11 @@ static int _drbd_md_sync_page_io(struct drbd_conf *mdev,
        bio->bi_end_io = drbd_md_io_complete;
        bio->bi_rw = rw;
 
-       if (!get_ldev_if_state(mdev, D_ATTACHING)) {  /* Corresponding put_ldev in drbd_md_io_complete() */
+       if (!(rw & WRITE) && mdev->state.disk == D_DISKLESS && mdev->ldev == NULL)
+               /* special case, drbd_md_read() during drbd_adm_attach(): no get_ldev */
+               ;
+       else if (!get_ldev_if_state(mdev, D_ATTACHING)) {
+               /* Corresponding put_ldev in drbd_md_io_complete() */
                dev_err(DEV, "ASSERT FAILED: get_ldev_if_state() == 1 in _drbd_md_sync_page_io()\n");
                err = -ENODEV;
                goto out;
@@ -199,9 +202,10 @@ int drbd_md_sync_page_io(struct drbd_conf *mdev, struct drbd_backing_dev *bdev,
 
        BUG_ON(!bdev->md_bdev);
 
-       dev_dbg(DEV, "meta_data io: %s [%d]:%s(,%llus,%s)\n",
+       dev_dbg(DEV, "meta_data io: %s [%d]:%s(,%llus,%s) %pS\n",
             current->comm, current->pid, __func__,
-            (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ");
+            (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ",
+            (void*)_RET_IP_ );
 
        if (sector < drbd_md_first_sector(bdev) ||
            sector + 7 > drbd_md_last_sector(bdev))
@@ -209,7 +213,8 @@ int drbd_md_sync_page_io(struct drbd_conf *mdev, struct drbd_backing_dev *bdev,
                     current->comm, current->pid, __func__,
                     (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ");
 
-       err = _drbd_md_sync_page_io(mdev, bdev, iop, sector, rw, MD_BLOCK_SIZE);
+       /* we do all our meta data IO in aligned 4k blocks. */
+       err = _drbd_md_sync_page_io(mdev, bdev, iop, sector, rw, 4096);
        if (err) {
                dev_err(DEV, "drbd_md_sync_page_io(,%llus,%s) failed with error %d\n",
                    (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ", err);
@@ -217,44 +222,99 @@ int drbd_md_sync_page_io(struct drbd_conf *mdev, struct drbd_backing_dev *bdev,
        return err;
 }
 
-static struct lc_element *_al_get(struct drbd_conf *mdev, unsigned int enr)
+static struct bm_extent *find_active_resync_extent(struct drbd_conf *mdev, unsigned int enr)
 {
-       struct lc_element *al_ext;
        struct lc_element *tmp;
-       int wake;
-
-       spin_lock_irq(&mdev->al_lock);
        tmp = lc_find(mdev->resync, enr/AL_EXT_PER_BM_SECT);
        if (unlikely(tmp != NULL)) {
                struct bm_extent  *bm_ext = lc_entry(tmp, struct bm_extent, lce);
-               if (test_bit(BME_NO_WRITES, &bm_ext->flags)) {
-                       wake = !test_and_set_bit(BME_PRIORITY, &bm_ext->flags);
-                       spin_unlock_irq(&mdev->al_lock);
-                       if (wake)
-                               wake_up(&mdev->al_wait);
-                       return NULL;
-               }
+               if (test_bit(BME_NO_WRITES, &bm_ext->flags))
+                       return bm_ext;
+       }
+       return NULL;
+}
+
+static struct lc_element *_al_get(struct drbd_conf *mdev, unsigned int enr, bool nonblock)
+{
+       struct lc_element *al_ext;
+       struct bm_extent *bm_ext;
+       int wake;
+
+       spin_lock_irq(&mdev->al_lock);
+       bm_ext = find_active_resync_extent(mdev, enr);
+       if (bm_ext) {
+               wake = !test_and_set_bit(BME_PRIORITY, &bm_ext->flags);
+               spin_unlock_irq(&mdev->al_lock);
+               if (wake)
+                       wake_up(&mdev->al_wait);
+               return NULL;
        }
-       al_ext = lc_get(mdev->act_log, enr);
+       if (nonblock)
+               al_ext = lc_try_get(mdev->act_log, enr);
+       else
+               al_ext = lc_get(mdev->act_log, enr);
        spin_unlock_irq(&mdev->al_lock);
        return al_ext;
 }
 
-void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i)
+bool drbd_al_begin_io_fastpath(struct drbd_conf *mdev, struct drbd_interval *i)
 {
        /* for bios crossing activity log extent boundaries,
         * we may need to activate two extents in one go */
        unsigned first = i->sector >> (AL_EXTENT_SHIFT-9);
        unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9);
-       unsigned enr;
-       bool locked = false;
 
+       D_ASSERT((unsigned)(last - first) <= 1);
+       D_ASSERT(atomic_read(&mdev->local_cnt) > 0);
+
+       /* FIXME figure out a fast path for bios crossing AL extent boundaries */
+       if (first != last)
+               return false;
+
+       return _al_get(mdev, first, true);
+}
+
+bool drbd_al_begin_io_prepare(struct drbd_conf *mdev, struct drbd_interval *i)
+{
+       /* for bios crossing activity log extent boundaries,
+        * we may need to activate two extents in one go */
+       unsigned first = i->sector >> (AL_EXTENT_SHIFT-9);
+       unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9);
+       unsigned enr;
+       bool need_transaction = false;
 
        D_ASSERT(first <= last);
        D_ASSERT(atomic_read(&mdev->local_cnt) > 0);
 
-       for (enr = first; enr <= last; enr++)
-               wait_event(mdev->al_wait, _al_get(mdev, enr) != NULL);
+       for (enr = first; enr <= last; enr++) {
+               struct lc_element *al_ext;
+               wait_event(mdev->al_wait,
+                               (al_ext = _al_get(mdev, enr, false)) != NULL);
+               if (al_ext->lc_number != enr)
+                       need_transaction = true;
+       }
+       return need_transaction;
+}
+
+static int al_write_transaction(struct drbd_conf *mdev, bool delegate);
+
+/* When called through generic_make_request(), we must delegate
+ * activity log I/O to the worker thread: a further request
+ * submitted via generic_make_request() within the same task
+ * would be queued on current->bio_list, and would only start
+ * after this function returns (see generic_make_request()).
+ *
+ * However, if we *are* the worker, we must not delegate to ourselves.
+ */
+
+/*
+ * @delegate:   delegate activity log I/O to the worker thread
+ */
+void drbd_al_begin_io_commit(struct drbd_conf *mdev, bool delegate)
+{
+       bool locked = false;
+
+       BUG_ON(delegate && current == mdev->tconn->worker.task);
 
        /* Serialize multiple transactions.
         * This uses test_and_set_bit, memory barrier is implicit.
@@ -264,13 +324,6 @@ void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i)
                        (locked = lc_try_lock_for_transaction(mdev->act_log)));
 
        if (locked) {
-               /* drbd_al_write_transaction(mdev,al_ext,enr);
-                * recurses into generic_make_request(), which
-                * disallows recursion, bios being serialized on the
-                * current->bio_tail list now.
-                * we have to delegate updates to the activity log
-                * to the worker thread. */
-
                /* Double check: it may have been committed by someone else,
                 * while we have been waiting for the lock. */
                if (mdev->act_log->pending_changes) {
@@ -280,11 +333,8 @@ void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i)
                        write_al_updates = rcu_dereference(mdev->ldev->disk_conf)->al_updates;
                        rcu_read_unlock();
 
-                       if (write_al_updates) {
-                               al_write_transaction(mdev);
-                               mdev->al_writ_cnt++;
-                       }
-
+                       if (write_al_updates)
+                               al_write_transaction(mdev, delegate);
                        spin_lock_irq(&mdev->al_lock);
                        /* FIXME
                        if (err)
@@ -298,6 +348,66 @@ void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i)
        }
 }
 
+/*
+ * @delegate:   delegate activity log I/O to the worker thread
+ */
+void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i, bool delegate)
+{
+       BUG_ON(delegate && current == mdev->tconn->worker.task);
+
+       if (drbd_al_begin_io_prepare(mdev, i))
+               drbd_al_begin_io_commit(mdev, delegate);
+}
+
+int drbd_al_begin_io_nonblock(struct drbd_conf *mdev, struct drbd_interval *i)
+{
+       struct lru_cache *al = mdev->act_log;
+       /* for bios crossing activity log extent boundaries,
+        * we may need to activate two extents in one go */
+       unsigned first = i->sector >> (AL_EXTENT_SHIFT-9);
+       unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9);
+       unsigned nr_al_extents;
+       unsigned available_update_slots;
+       unsigned enr;
+
+       D_ASSERT(first <= last);
+
+       nr_al_extents = 1 + last - first; /* worst case: all touched extents are cold. */
+       available_update_slots = min(al->nr_elements - al->used,
+                               al->max_pending_changes - al->pending_changes);
+
+       /* We want all necessary updates for a given request within the same transaction
+        * We could first check how many updates are *actually* needed,
+        * and use that instead of the worst-case nr_al_extents */
+       if (available_update_slots < nr_al_extents)
+               return -EWOULDBLOCK;
+
+       /* Is resync active in this area? */
+       for (enr = first; enr <= last; enr++) {
+               struct lc_element *tmp;
+               tmp = lc_find(mdev->resync, enr/AL_EXT_PER_BM_SECT);
+               if (unlikely(tmp != NULL)) {
+                       struct bm_extent  *bm_ext = lc_entry(tmp, struct bm_extent, lce);
+                       if (test_bit(BME_NO_WRITES, &bm_ext->flags)) {
+                               if (!test_and_set_bit(BME_PRIORITY, &bm_ext->flags))
+                                       return -EBUSY;
+                               return -EWOULDBLOCK;
+                       }
+               }
+       }
+
+       /* Checkout the refcounts.
+        * Given that we checked for available elements and update slots above,
+        * this has to be successful. */
+       for (enr = first; enr <= last; enr++) {
+               struct lc_element *al_ext;
+               al_ext = lc_get_cumulative(mdev->act_log, enr);
+               if (!al_ext)
+                       dev_info(DEV, "LOGIC BUG for enr=%u\n", enr);
+       }
+       return 0;
+}
+
 void drbd_al_complete_io(struct drbd_conf *mdev, struct drbd_interval *i)
 {
        /* for bios crossing activity log extent boundaries,
@@ -350,6 +460,24 @@ static unsigned int rs_extent_to_bm_page(unsigned int rs_enr)
                 (BM_EXT_SHIFT - BM_BLOCK_SHIFT));
 }
 
+static sector_t al_tr_number_to_on_disk_sector(struct drbd_conf *mdev)
+{
+       const unsigned int stripes = mdev->ldev->md.al_stripes;
+       const unsigned int stripe_size_4kB = mdev->ldev->md.al_stripe_size_4k;
+
+       /* transaction number, modulo on-disk ring buffer wrap around */
+       unsigned int t = mdev->al_tr_number % (mdev->ldev->md.al_size_4k);
+
+       /* ... to aligned 4k on disk block */
+       t = ((t % stripes) * stripe_size_4kB) + t/stripes;
+
+       /* ... to 512 byte sector in activity log */
+       t *= 8;
+
+       /* ... plus offset to the on disk position */
+       return mdev->ldev->md.md_offset + mdev->ldev->md.al_offset + t;
+}
+
 static int
 _al_write_transaction(struct drbd_conf *mdev)
 {
@@ -432,23 +560,27 @@ _al_write_transaction(struct drbd_conf *mdev)
        if (mdev->al_tr_cycle >= mdev->act_log->nr_elements)
                mdev->al_tr_cycle = 0;
 
-       sector =  mdev->ldev->md.md_offset
-               + mdev->ldev->md.al_offset
-               + mdev->al_tr_pos * (MD_BLOCK_SIZE>>9);
+       sector = al_tr_number_to_on_disk_sector(mdev);
 
        crc = crc32c(0, buffer, 4096);
        buffer->crc32c = cpu_to_be32(crc);
 
        if (drbd_bm_write_hinted(mdev))
                err = -EIO;
-               /* drbd_chk_io_error done already */
-       else if (drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) {
-               err = -EIO;
-               drbd_chk_io_error(mdev, 1, DRBD_META_IO_ERROR);
-       } else {
-               /* advance ringbuffer position and transaction counter */
-               mdev->al_tr_pos = (mdev->al_tr_pos + 1) % (MD_AL_SECTORS*512/MD_BLOCK_SIZE);
-               mdev->al_tr_number++;
+       else {
+               bool write_al_updates;
+               rcu_read_lock();
+               write_al_updates = rcu_dereference(mdev->ldev->disk_conf)->al_updates;
+               rcu_read_unlock();
+               if (write_al_updates) {
+                       if (drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) {
+                               err = -EIO;
+                               drbd_chk_io_error(mdev, 1, DRBD_META_IO_ERROR);
+                       } else {
+                               mdev->al_tr_number++;
+                               mdev->al_writ_cnt++;
+                       }
+               }
        }
 
        drbd_md_put_buffer(mdev);
@@ -474,20 +606,18 @@ static int w_al_write_transaction(struct drbd_work *w, int unused)
 /* Calls from worker context (see w_restart_disk_io()) need to write the
    transaction directly. Others came through generic_make_request(),
    those need to delegate it to the worker. */
-static int al_write_transaction(struct drbd_conf *mdev)
+static int al_write_transaction(struct drbd_conf *mdev, bool delegate)
 {
-       struct update_al_work al_work;
-
-       if (current == mdev->tconn->worker.task)
+       if (delegate) {
+               struct update_al_work al_work;
+               init_completion(&al_work.event);
+               al_work.w.cb = w_al_write_transaction;
+               al_work.w.mdev = mdev;
+               drbd_queue_work_front(&mdev->tconn->sender_work, &al_work.w);
+               wait_for_completion(&al_work.event);
+               return al_work.err;
+       } else
                return _al_write_transaction(mdev);
-
-       init_completion(&al_work.event);
-       al_work.w.cb = w_al_write_transaction;
-       al_work.w.mdev = mdev;
-       drbd_queue_work_front(&mdev->tconn->sender_work, &al_work.w);
-       wait_for_completion(&al_work.event);
-
-       return al_work.err;
 }
 
 static int _try_lc_del(struct drbd_conf *mdev, struct lc_element *al_ext)
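
The striping math in al_tr_number_to_on_disk_sector() above is easiest to follow with numbers plugged in; the values below are made up purely for illustration:

	/* Old single-stripe layout: al_stripes = 1, al_stripe_size_4k = 8, al_size_4k = 8.  */
	/* Transaction 3:  t = 3 % 8 = 3;  t = (3 % 1) * 8 + 3 / 1 = 3;  t *= 8  ->  24,     */
	/* so it lands 24 sectors past md_offset + al_offset (the fourth 4k block).          */

	/* Striped layout: al_stripes = 4, al_stripe_size_4k = 2, al_size_4k = 8.            */
	/* Transaction 3:  t = 3;  t = (3 % 4) * 2 + 3 / 4 = 6;  t *= 8  ->  48 sectors in.  */
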
index 8dc29502dc08c172ce8d693e78a1578c72bc4562..64fbb8385cdc87575ce499d888919ac3cb52b81e 100644 (file)
@@ -612,6 +612,17 @@ static void bm_memset(struct drbd_bitmap *b, size_t offset, int c, size_t len)
        }
 }
 
+/* For the layout, see comment above drbd_md_set_sector_offsets(). */
+static u64 drbd_md_on_disk_bits(struct drbd_backing_dev *ldev)
+{
+       u64 bitmap_sectors;
+       if (ldev->md.al_offset == 8)
+               bitmap_sectors = ldev->md.md_size_sect - ldev->md.bm_offset;
+       else
+               bitmap_sectors = ldev->md.al_offset - ldev->md.bm_offset;
+       return bitmap_sectors << (9 + 3);
+}
+
 /*
  * make sure the bitmap has enough room for the attached storage,
  * if necessary, resize.
@@ -668,7 +679,7 @@ int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity, int set_new_bits)
        words = ALIGN(bits, 64) >> LN2_BPL;
 
        if (get_ldev(mdev)) {
-               u64 bits_on_disk = ((u64)mdev->ldev->md.md_size_sect-MD_BM_OFFSET) << 12;
+               u64 bits_on_disk = drbd_md_on_disk_bits(mdev->ldev);
                put_ldev(mdev);
                if (bits > bits_on_disk) {
                        dev_info(DEV, "bits = %lu\n", bits);
index 6b51afa1aae19070d3ec26cf53d784ed71fab3d7..f943aacfdad8a90aa07416d905fa2a0bb207b665 100644 (file)
@@ -753,13 +753,16 @@ struct drbd_md {
        u32 flags;
        u32 md_size_sect;
 
-       s32 al_offset;  /* signed relative sector offset to al area */
+       s32 al_offset;  /* signed relative sector offset to activity log */
        s32 bm_offset;  /* signed relative sector offset to bitmap */
 
-       /* u32 al_nr_extents;      important for restoring the AL
-        * is stored into  ldev->dc.al_extents, which in turn
-        * gets applied to act_log->nr_elements
-        */
+       /* cached value of bdev->disk_conf->meta_dev_idx (see below) */
+       s32 meta_dev_idx;
+
+       /* see al_tr_number_to_on_disk_sector() */
+       u32 al_stripes;
+       u32 al_stripe_size_4k;
+       u32 al_size_4k; /* cached product of the above */
 };
 
 struct drbd_backing_dev {
@@ -891,6 +894,14 @@ struct drbd_tconn {                        /* is a resource from the config file */
        } send;
 };
 
+struct submit_worker {
+       struct workqueue_struct *wq;
+       struct work_struct worker;
+
+       spinlock_t lock;
+       struct list_head writes;
+};
+
 struct drbd_conf {
        struct drbd_tconn *tconn;
        int vnr;                        /* volume number within the connection */
@@ -1009,7 +1020,6 @@ struct drbd_conf {
        struct lru_cache *act_log;      /* activity log */
        unsigned int al_tr_number;
        int al_tr_cycle;
-       int al_tr_pos;   /* position of the next transaction in the journal */
        wait_queue_head_t seq_wait;
        atomic_t packet_seq;
        unsigned int peer_seq;
@@ -1032,6 +1042,10 @@ struct drbd_conf {
        atomic_t ap_in_flight; /* App sectors in flight (waiting for ack) */
        unsigned int peer_max_bio_size;
        unsigned int local_max_bio_size;
+
+       /* any requests that would block in drbd_make_request()
+        * are deferred to this single-threaded work queue */
+       struct submit_worker submit;
 };
 
 static inline struct drbd_conf *minor_to_mdev(unsigned int minor)
@@ -1148,25 +1162,44 @@ extern int drbd_bitmap_io_from_worker(struct drbd_conf *mdev,
                char *why, enum bm_flag flags);
 extern int drbd_bmio_set_n_write(struct drbd_conf *mdev);
 extern int drbd_bmio_clear_n_write(struct drbd_conf *mdev);
-extern void drbd_go_diskless(struct drbd_conf *mdev);
 extern void drbd_ldev_destroy(struct drbd_conf *mdev);
 
 /* Meta data layout
-   We reserve a 128MB Block (4k aligned)
-   * either at the end of the backing device
-   * or on a separate meta data device. */
+ *
+ * We currently have two possible layouts.
+ * Offsets in (512 byte) sectors.
+ * external:
+ *   |----------- md_size_sect ------------------|
+ *   [ 4k superblock ][ activity log ][  Bitmap  ]
+ *   | al_offset == 8 |
+ *   | bm_offset = al_offset + X      |
+ *  ==> bitmap sectors = md_size_sect - bm_offset
+ *
+ *  Variants:
+ *     old, indexed fixed size meta data:
+ *
+ * internal:
+ *            |----------- md_size_sect ------------------|
+ * [data.....][  Bitmap  ][ activity log ][ 4k superblock ][padding*]
+ *                        | al_offset < 0 |
+ *            | bm_offset = al_offset - Y |
+ *  ==> bitmap sectors = Y = al_offset - bm_offset
+ *
+ *  [padding*] are zero or up to 7 unused 512 Byte sectors to the
+ *  end of the device, so that the [4k superblock] will be 4k aligned.
+ *
+ *  The activity log consists of 4k transaction blocks,
+ *  which are written in a ring-buffer, or striped ring-buffer like fashion,
+ *  whose total size used to be fixed at 32kB,
+ *  but is about to become configurable.
+ */
 
-/* The following numbers are sectors */
-/* Allows up to about 3.8TB, so if you want more,
+/* Our old fixed size meta data layout
+ * allows up to about 3.8TB, so if you want more,
  * you need to use the "flexible" meta data format. */
-#define MD_RESERVED_SECT (128LU << 11)  /* 128 MB, unit sectors */
-#define MD_AL_OFFSET   8    /* 8 Sectors after start of meta area */
-#define MD_AL_SECTORS  64   /* = 32 kB on disk activity log ring buffer */
-#define MD_BM_OFFSET (MD_AL_OFFSET + MD_AL_SECTORS)
-
-/* we do all meta data IO in 4k blocks */
-#define MD_BLOCK_SHIFT 12
-#define MD_BLOCK_SIZE  (1<<MD_BLOCK_SHIFT)
+#define MD_128MB_SECT (128LLU << 11)  /* 128 MB, unit sectors */
+#define MD_4kB_SECT     8
+#define MD_32kB_SECT   64
 
 /* One activity log extent represents 4M of storage */
 #define AL_EXTENT_SHIFT 22
@@ -1256,7 +1289,6 @@ struct bm_extent {
 
 /* in one sector of the bitmap, we have this many activity_log extents. */
 #define AL_EXT_PER_BM_SECT  (1 << (BM_EXT_SHIFT - AL_EXTENT_SHIFT))
-#define BM_WORDS_PER_AL_EXT (1 << (AL_EXTENT_SHIFT-BM_BLOCK_SHIFT-LN2_BPL))
 
 #define BM_BLOCKS_PER_BM_EXT_B (BM_EXT_SHIFT - BM_BLOCK_SHIFT)
 #define BM_BLOCKS_PER_BM_EXT_MASK  ((1<<BM_BLOCKS_PER_BM_EXT_B) - 1)
@@ -1276,16 +1308,18 @@ struct bm_extent {
  */
 
 #define DRBD_MAX_SECTORS_32 (0xffffffffLU)
-#define DRBD_MAX_SECTORS_BM \
-         ((MD_RESERVED_SECT - MD_BM_OFFSET) * (1LL<<(BM_EXT_SHIFT-9)))
-#if DRBD_MAX_SECTORS_BM < DRBD_MAX_SECTORS_32
-#define DRBD_MAX_SECTORS      DRBD_MAX_SECTORS_BM
-#define DRBD_MAX_SECTORS_FLEX DRBD_MAX_SECTORS_BM
-#elif !defined(CONFIG_LBDAF) && BITS_PER_LONG == 32
+/* we have a certain meta data variant that has a fixed on-disk size of 128
+ * MiB, of which 4k are our "superblock", and 32k are the fixed size activity
+ * log, leaving this many sectors for the bitmap.
+ */
+
+#define DRBD_MAX_SECTORS_FIXED_BM \
+         ((MD_128MB_SECT - MD_32kB_SECT - MD_4kB_SECT) * (1LL<<(BM_EXT_SHIFT-9)))
+#if !defined(CONFIG_LBDAF) && BITS_PER_LONG == 32
 #define DRBD_MAX_SECTORS      DRBD_MAX_SECTORS_32
 #define DRBD_MAX_SECTORS_FLEX DRBD_MAX_SECTORS_32
 #else
-#define DRBD_MAX_SECTORS      DRBD_MAX_SECTORS_BM
+#define DRBD_MAX_SECTORS      DRBD_MAX_SECTORS_FIXED_BM
 /* 16 TB in units of sectors */
 #if BITS_PER_LONG == 32
 /* adjust by one page worth of bitmap,
@@ -1418,6 +1452,7 @@ extern void conn_free_crypto(struct drbd_tconn *tconn);
 extern int proc_details;
 
 /* drbd_req */
+extern void do_submit(struct work_struct *ws);
 extern void __drbd_make_request(struct drbd_conf *, struct bio *, unsigned long);
 extern void drbd_make_request(struct request_queue *q, struct bio *bio);
 extern int drbd_read_remote(struct drbd_conf *mdev, struct drbd_request *req);
@@ -1576,7 +1611,10 @@ extern const char *drbd_conn_str(enum drbd_conns s);
 extern const char *drbd_role_str(enum drbd_role s);
 
 /* drbd_actlog.c */
-extern void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i);
+extern int drbd_al_begin_io_nonblock(struct drbd_conf *mdev, struct drbd_interval *i);
+extern void drbd_al_begin_io_commit(struct drbd_conf *mdev, bool delegate);
+extern bool drbd_al_begin_io_fastpath(struct drbd_conf *mdev, struct drbd_interval *i);
+extern void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i, bool delegate);
 extern void drbd_al_complete_io(struct drbd_conf *mdev, struct drbd_interval *i);
 extern void drbd_rs_complete_io(struct drbd_conf *mdev, sector_t sector);
 extern int drbd_rs_begin_io(struct drbd_conf *mdev, sector_t sector);
@@ -1755,9 +1793,9 @@ static inline void drbd_chk_io_error_(struct drbd_conf *mdev,
  * BTW, for internal meta data, this happens to be the maximum capacity
  * we could agree upon with our peer node.
  */
-static inline sector_t _drbd_md_first_sector(int meta_dev_idx, struct drbd_backing_dev *bdev)
+static inline sector_t drbd_md_first_sector(struct drbd_backing_dev *bdev)
 {
-       switch (meta_dev_idx) {
+       switch (bdev->md.meta_dev_idx) {
        case DRBD_MD_INDEX_INTERNAL:
        case DRBD_MD_INDEX_FLEX_INT:
                return bdev->md.md_offset + bdev->md.bm_offset;
@@ -1767,36 +1805,19 @@ static inline sector_t _drbd_md_first_sector(int meta_dev_idx, struct drbd_backi
        }
 }
 
-static inline sector_t drbd_md_first_sector(struct drbd_backing_dev *bdev)
-{
-       int meta_dev_idx;
-
-       rcu_read_lock();
-       meta_dev_idx = rcu_dereference(bdev->disk_conf)->meta_dev_idx;
-       rcu_read_unlock();
-
-       return _drbd_md_first_sector(meta_dev_idx, bdev);
-}
-
 /**
  * drbd_md_last_sector() - Return the last sector number of the meta data area
  * @bdev:      Meta data block device.
  */
 static inline sector_t drbd_md_last_sector(struct drbd_backing_dev *bdev)
 {
-       int meta_dev_idx;
-
-       rcu_read_lock();
-       meta_dev_idx = rcu_dereference(bdev->disk_conf)->meta_dev_idx;
-       rcu_read_unlock();
-
-       switch (meta_dev_idx) {
+       switch (bdev->md.meta_dev_idx) {
        case DRBD_MD_INDEX_INTERNAL:
        case DRBD_MD_INDEX_FLEX_INT:
-               return bdev->md.md_offset + MD_AL_OFFSET - 1;
+               return bdev->md.md_offset + MD_4kB_SECT -1;
        case DRBD_MD_INDEX_FLEX_EXT:
        default:
-               return bdev->md.md_offset + bdev->md.md_size_sect;
+               return bdev->md.md_offset + bdev->md.md_size_sect -1;
        }
 }
 
@@ -1818,18 +1839,13 @@ static inline sector_t drbd_get_capacity(struct block_device *bdev)
 static inline sector_t drbd_get_max_capacity(struct drbd_backing_dev *bdev)
 {
        sector_t s;
-       int meta_dev_idx;
 
-       rcu_read_lock();
-       meta_dev_idx = rcu_dereference(bdev->disk_conf)->meta_dev_idx;
-       rcu_read_unlock();
-
-       switch (meta_dev_idx) {
+       switch (bdev->md.meta_dev_idx) {
        case DRBD_MD_INDEX_INTERNAL:
        case DRBD_MD_INDEX_FLEX_INT:
                s = drbd_get_capacity(bdev->backing_bdev)
                        ? min_t(sector_t, DRBD_MAX_SECTORS_FLEX,
-                               _drbd_md_first_sector(meta_dev_idx, bdev))
+                               drbd_md_first_sector(bdev))
                        : 0;
                break;
        case DRBD_MD_INDEX_FLEX_EXT:
@@ -1848,39 +1864,24 @@ static inline sector_t drbd_get_max_capacity(struct drbd_backing_dev *bdev)
 }
 
 /**
- * drbd_md_ss__() - Return the sector number of our meta data super block
- * @mdev:      DRBD device.
+ * drbd_md_ss() - Return the sector number of our meta data super block
  * @bdev:      Meta data block device.
  */
-static inline sector_t drbd_md_ss__(struct drbd_conf *mdev,
-                                   struct drbd_backing_dev *bdev)
+static inline sector_t drbd_md_ss(struct drbd_backing_dev *bdev)
 {
-       int meta_dev_idx;
+       const int meta_dev_idx = bdev->md.meta_dev_idx;
 
-       rcu_read_lock();
-       meta_dev_idx = rcu_dereference(bdev->disk_conf)->meta_dev_idx;
-       rcu_read_unlock();
-
-       switch (meta_dev_idx) {
-       default: /* external, some index */
-               return MD_RESERVED_SECT * meta_dev_idx;
-       case DRBD_MD_INDEX_INTERNAL:
-               /* with drbd08, internal meta data is always "flexible" */
-       case DRBD_MD_INDEX_FLEX_INT:
-               /* sizeof(struct md_on_disk_07) == 4k
-                * position: last 4k aligned block of 4k size */
-               if (!bdev->backing_bdev) {
-                       if (__ratelimit(&drbd_ratelimit_state)) {
-                               dev_err(DEV, "bdev->backing_bdev==NULL\n");
-                               dump_stack();
-                       }
-                       return 0;
-               }
-               return (drbd_get_capacity(bdev->backing_bdev) & ~7ULL)
-                       - MD_AL_OFFSET;
-       case DRBD_MD_INDEX_FLEX_EXT:
+       if (meta_dev_idx == DRBD_MD_INDEX_FLEX_EXT)
                return 0;
-       }
+
+       /* Since drbd08, internal meta data is always "flexible".
+        * position: last 4k aligned block of 4k size */
+       if (meta_dev_idx == DRBD_MD_INDEX_INTERNAL ||
+           meta_dev_idx == DRBD_MD_INDEX_FLEX_INT)
+               return (drbd_get_capacity(bdev->backing_bdev) & ~7ULL) - 8;
+
+       /* external, some index; this is the old fixed size layout */
+       return MD_128MB_SECT * bdev->md.meta_dev_idx;
 }
 
 static inline void
@@ -2053,9 +2054,11 @@ static inline void put_ldev(struct drbd_conf *mdev)
                if (mdev->state.disk == D_DISKLESS)
                        /* even internal references gone, safe to destroy */
                        drbd_ldev_destroy(mdev);
-               if (mdev->state.disk == D_FAILED)
+               if (mdev->state.disk == D_FAILED) {
                        /* all application IO references gone. */
-                       drbd_go_diskless(mdev);
+                       if (!test_and_set_bit(GO_DISKLESS, &mdev->flags))
+                               drbd_queue_work(&mdev->tconn->sender_work, &mdev->go_diskless);
+               }
                wake_up(&mdev->misc_wait);
        }
 }
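
With the rewritten drbd_md_ss() above, internal meta data keeps its superblock in the last fully 4k-aligned 4 KiB block of the backing device. A worked example with a made-up capacity:

	/* Backing device capacity: 2000005 sectors (512-byte units).              */
	/* (2000005 & ~7ULL) == 2000000; minus 8 -> superblock starts at 1999992,  */
	/* i.e. sectors 1999992..1999999, with sectors 2000000..2000004 left as    */
	/* the unused padding mentioned in the layout comment.                     */
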
index e98da675f0c17b5085747d4b40a0ebe98a97f174..a5dca6affcbb894dc2c5d97c63c84f8972503d4e 100644 (file)
@@ -45,7 +45,7 @@
 #include <linux/reboot.h>
 #include <linux/notifier.h>
 #include <linux/kthread.h>
-
+#include <linux/workqueue.h>
 #define __KERNEL_SYSCALLS__
 #include <linux/unistd.h>
 #include <linux/vmalloc.h>
@@ -63,7 +63,7 @@ int drbd_asender(struct drbd_thread *);
 
 int drbd_init(void);
 static int drbd_open(struct block_device *bdev, fmode_t mode);
-static int drbd_release(struct gendisk *gd, fmode_t mode);
+static void drbd_release(struct gendisk *gd, fmode_t mode);
 static int w_md_sync(struct drbd_work *w, int unused);
 static void md_sync_timer_fn(unsigned long data);
 static int w_bitmap_io(struct drbd_work *w, int unused);
@@ -1849,13 +1849,12 @@ static int drbd_open(struct block_device *bdev, fmode_t mode)
        return rv;
 }
 
-static int drbd_release(struct gendisk *gd, fmode_t mode)
+static void drbd_release(struct gendisk *gd, fmode_t mode)
 {
        struct drbd_conf *mdev = gd->private_data;
        mutex_lock(&drbd_main_mutex);
        mdev->open_cnt--;
        mutex_unlock(&drbd_main_mutex);
-       return 0;
 }
 
 static void drbd_set_defaults(struct drbd_conf *mdev)
@@ -2300,6 +2299,7 @@ static void drbd_cleanup(void)
        idr_for_each_entry(&minors, mdev, i) {
                idr_remove(&minors, mdev_to_minor(mdev));
                idr_remove(&mdev->tconn->volumes, mdev->vnr);
+               destroy_workqueue(mdev->submit.wq);
                del_gendisk(mdev->vdisk);
                /* synchronize_rcu(); No other threads running at this point */
                kref_put(&mdev->kref, &drbd_minor_destroy);
@@ -2589,6 +2589,21 @@ void conn_destroy(struct kref *kref)
        kfree(tconn);
 }
 
+int init_submitter(struct drbd_conf *mdev)
+{
+       /* opencoded create_singlethread_workqueue(),
+        * to be able to say "drbd%d", ..., minor */
+       mdev->submit.wq = alloc_workqueue("drbd%u_submit",
+                       WQ_UNBOUND | WQ_MEM_RECLAIM, 1, mdev->minor);
+       if (!mdev->submit.wq)
+               return -ENOMEM;
+
+       INIT_WORK(&mdev->submit.worker, do_submit);
+       spin_lock_init(&mdev->submit.lock);
+       INIT_LIST_HEAD(&mdev->submit.writes);
+       return 0;
+}
+
 enum drbd_ret_code conn_new_minor(struct drbd_tconn *tconn, unsigned int minor, int vnr)
 {
        struct drbd_conf *mdev;
@@ -2678,6 +2693,12 @@ enum drbd_ret_code conn_new_minor(struct drbd_tconn *tconn, unsigned int minor,
                goto out_idr_remove_minor;
        }
 
+       if (init_submitter(mdev)) {
+               err = ERR_NOMEM;
+               drbd_msg_put_info("unable to create submit workqueue");
+               goto out_idr_remove_vol;
+       }
+
        add_disk(disk);
        kref_init(&mdev->kref); /* one ref for both idrs and the add_disk */
 
@@ -2688,6 +2709,8 @@ enum drbd_ret_code conn_new_minor(struct drbd_tconn *tconn, unsigned int minor,
 
        return NO_ERROR;
 
+out_idr_remove_vol:
+       idr_remove(&tconn->volumes, vnr_got);
 out_idr_remove_minor:
        idr_remove(&minors, minor_got);
        synchronize_rcu();
@@ -2795,6 +2818,7 @@ void drbd_free_bc(struct drbd_backing_dev *ldev)
        blkdev_put(ldev->backing_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
        blkdev_put(ldev->md_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
 
+       kfree(ldev->disk_conf);
        kfree(ldev);
 }
 
@@ -2834,8 +2858,9 @@ void conn_md_sync(struct drbd_tconn *tconn)
        rcu_read_unlock();
 }
 
+/* aligned 4kByte */
 struct meta_data_on_disk {
-       u64 la_size;           /* last agreed size. */
+       u64 la_size_sect;      /* last agreed size. */
        u64 uuid[UI_SIZE];   /* UUIDs. */
        u64 device_uuid;
        u64 reserved_u64_1;
@@ -2843,13 +2868,17 @@ struct meta_data_on_disk {
        u32 magic;
        u32 md_size_sect;
        u32 al_offset;         /* offset to this block */
-       u32 al_nr_extents;     /* important for restoring the AL */
+       u32 al_nr_extents;     /* important for restoring the AL (userspace) */
              /* `-- act_log->nr_elements <-- ldev->dc.al_extents */
        u32 bm_offset;         /* offset to the bitmap, from here */
        u32 bm_bytes_per_bit;  /* BM_BLOCK_SIZE */
        u32 la_peer_max_bio_size;   /* last peer max_bio_size */
-       u32 reserved_u32[3];
 
+       /* see al_tr_number_to_on_disk_sector() */
+       u32 al_stripes;
+       u32 al_stripe_size_4k;
+
+       u8 reserved_u8[4096 - (7*8 + 10*4)];
 } __packed;
 
 /**
@@ -2862,6 +2891,10 @@ void drbd_md_sync(struct drbd_conf *mdev)
        sector_t sector;
        int i;
 
+       /* Don't accidentally change the DRBD meta data layout. */
+       BUILD_BUG_ON(UI_SIZE != 4);
+       BUILD_BUG_ON(sizeof(struct meta_data_on_disk) != 4096);
+
        del_timer(&mdev->md_sync_timer);
        /* timer may be rearmed by drbd_md_mark_dirty() now. */
        if (!test_and_clear_bit(MD_DIRTY, &mdev->flags))
@@ -2876,9 +2909,9 @@ void drbd_md_sync(struct drbd_conf *mdev)
        if (!buffer)
                goto out;
 
-       memset(buffer, 0, 512);
+       memset(buffer, 0, sizeof(*buffer));
 
-       buffer->la_size = cpu_to_be64(drbd_get_capacity(mdev->this_bdev));
+       buffer->la_size_sect = cpu_to_be64(drbd_get_capacity(mdev->this_bdev));
        for (i = UI_CURRENT; i < UI_SIZE; i++)
                buffer->uuid[i] = cpu_to_be64(mdev->ldev->md.uuid[i]);
        buffer->flags = cpu_to_be32(mdev->ldev->md.flags);
@@ -2893,7 +2926,10 @@ void drbd_md_sync(struct drbd_conf *mdev)
        buffer->bm_offset = cpu_to_be32(mdev->ldev->md.bm_offset);
        buffer->la_peer_max_bio_size = cpu_to_be32(mdev->peer_max_bio_size);
 
-       D_ASSERT(drbd_md_ss__(mdev, mdev->ldev) == mdev->ldev->md.md_offset);
+       buffer->al_stripes = cpu_to_be32(mdev->ldev->md.al_stripes);
+       buffer->al_stripe_size_4k = cpu_to_be32(mdev->ldev->md.al_stripe_size_4k);
+
+       D_ASSERT(drbd_md_ss(mdev->ldev) == mdev->ldev->md.md_offset);
        sector = mdev->ldev->md.md_offset;
 
        if (drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) {
@@ -2911,13 +2947,141 @@ out:
        put_ldev(mdev);
 }
 
+static int check_activity_log_stripe_size(struct drbd_conf *mdev,
+               struct meta_data_on_disk *on_disk,
+               struct drbd_md *in_core)
+{
+       u32 al_stripes = be32_to_cpu(on_disk->al_stripes);
+       u32 al_stripe_size_4k = be32_to_cpu(on_disk->al_stripe_size_4k);
+       u64 al_size_4k;
+
+       /* both not set: default to old fixed size activity log */
+       if (al_stripes == 0 && al_stripe_size_4k == 0) {
+               al_stripes = 1;
+               al_stripe_size_4k = MD_32kB_SECT/8;
+       }
+
+       /* some paranoia plausibility checks */
+
+       /* we need both values to be set */
+       if (al_stripes == 0 || al_stripe_size_4k == 0)
+               goto err;
+
+       al_size_4k = (u64)al_stripes * al_stripe_size_4k;
+
+       /* Upper limit of activity log area, to avoid potential overflow
+        * problems in al_tr_number_to_on_disk_sector(). As right now, more
+        * than 72 * 4k blocks total only increases the amount of history,
+        * limiting this arbitrarily to 16 GB is not a real limitation ;-)  */
+       if (al_size_4k > (16 * 1024 * 1024/4))
+               goto err;
+
+       /* Lower limit: we need at least 8 transaction slots (32kB)
+        * to not break existing setups */
+       if (al_size_4k < MD_32kB_SECT/8)
+               goto err;
+
+       in_core->al_stripe_size_4k = al_stripe_size_4k;
+       in_core->al_stripes = al_stripes;
+       in_core->al_size_4k = al_size_4k;
+
+       return 0;
+err:
+       dev_err(DEV, "invalid activity log striping: al_stripes=%u, al_stripe_size_4k=%u\n",
+                       al_stripes, al_stripe_size_4k);
+       return -EINVAL;
+}
+
+static int check_offsets_and_sizes(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
+{
+       sector_t capacity = drbd_get_capacity(bdev->md_bdev);
+       struct drbd_md *in_core = &bdev->md;
+       s32 on_disk_al_sect;
+       s32 on_disk_bm_sect;
+
+       /* The on-disk size of the activity log, calculated from offsets, and
+        * the size of the activity log calculated from the stripe settings,
+        * should match.
+        * Though we could relax this a bit: it is ok, if the striped activity log
+        * fits in the available on-disk activity log size.
+        * Right now, that would break how resize is implemented.
+        * TODO: make drbd_determine_dev_size() (and the drbdmeta tool) aware
+        * of possible unused padding space in the on disk layout. */
+       if (in_core->al_offset < 0) {
+               if (in_core->bm_offset > in_core->al_offset)
+                       goto err;
+               on_disk_al_sect = -in_core->al_offset;
+               on_disk_bm_sect = in_core->al_offset - in_core->bm_offset;
+       } else {
+               if (in_core->al_offset != MD_4kB_SECT)
+                       goto err;
+               if (in_core->bm_offset < in_core->al_offset + in_core->al_size_4k * MD_4kB_SECT)
+                       goto err;
+
+               on_disk_al_sect = in_core->bm_offset - MD_4kB_SECT;
+               on_disk_bm_sect = in_core->md_size_sect - in_core->bm_offset;
+       }
+
+       /* old fixed size meta data is exactly that: fixed. */
+       if (in_core->meta_dev_idx >= 0) {
+               if (in_core->md_size_sect != MD_128MB_SECT
+               ||  in_core->al_offset != MD_4kB_SECT
+               ||  in_core->bm_offset != MD_4kB_SECT + MD_32kB_SECT
+               ||  in_core->al_stripes != 1
+               ||  in_core->al_stripe_size_4k != MD_32kB_SECT/8)
+                       goto err;
+       }
+
+       if (capacity < in_core->md_size_sect)
+               goto err;
+       if (capacity - in_core->md_size_sect < drbd_md_first_sector(bdev))
+               goto err;
+
+       /* should be aligned, and at least 32k */
+       if ((on_disk_al_sect & 7) || (on_disk_al_sect < MD_32kB_SECT))
+               goto err;
+
+       /* should fit (for now: exactly) into the available on-disk space;
+        * overflow prevention is in check_activity_log_stripe_size() above. */
+       if (on_disk_al_sect != in_core->al_size_4k * MD_4kB_SECT)
+               goto err;
+
+       /* again, should be aligned */
+       if (in_core->bm_offset & 7)
+               goto err;
+
+       /* FIXME check for device grow with flex external meta data? */
+
+       /* can the available bitmap space cover the last agreed device size? */
+       if (on_disk_bm_sect < (in_core->la_size_sect+7)/MD_4kB_SECT/8/512)
+               goto err;
+
+       return 0;
+
+err:
+       dev_err(DEV, "meta data offsets don't make sense: idx=%d "
+                       "al_s=%u, al_sz4k=%u, al_offset=%d, bm_offset=%d, "
+                       "md_size_sect=%u, la_size=%llu, md_capacity=%llu\n",
+                       in_core->meta_dev_idx,
+                       in_core->al_stripes, in_core->al_stripe_size_4k,
+                       in_core->al_offset, in_core->bm_offset, in_core->md_size_sect,
+                       (unsigned long long)in_core->la_size_sect,
+                       (unsigned long long)capacity);
+
+       return -EINVAL;
+}
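/*
 * A quick sketch of the bitmap-coverage check above, assuming MD_4kB_SECT
 * is 8 and each bitmap bit covers one 4 kB block (BM_BLOCK_SIZE):
 *   bits    = la_size_sect / 8        (one bit per 4 kB of data)
 *   bytes   = bits / 8
 *   sectors = bytes / 512             => about la_size_sect / 32768
 * e.g. a last agreed size of 1 TiB = 2147483648 sectors needs
 *   2147483648 / 32768 = 65536 bitmap sectors, i.e. 32 MiB of bitmap.
 */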
+
+
 /**
  * drbd_md_read() - Reads in the meta data super block
  * @mdev:      DRBD device.
  * @bdev:      Device from which the meta data should be read in.
  *
- * Return 0 (NO_ERROR) on success, and an enum drbd_ret_code in case
+ * Return NO_ERROR on success, and an enum drbd_ret_code in case
  * something goes wrong.
+ *
+ * Called exactly once during drbd_adm_attach(), while still being D_DISKLESS,
+ * even before @bdev is assigned to @mdev->ldev.
  */
 int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
 {
@@ -2925,12 +3089,17 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
        u32 magic, flags;
        int i, rv = NO_ERROR;
 
-       if (!get_ldev_if_state(mdev, D_ATTACHING))
-               return ERR_IO_MD_DISK;
+       if (mdev->state.disk != D_DISKLESS)
+               return ERR_DISK_CONFIGURED;
 
        buffer = drbd_md_get_buffer(mdev);
        if (!buffer)
-               goto out;
+               return ERR_NOMEM;
+
+       /* First, figure out where our meta data superblock is located,
+        * and read it. */
+       bdev->md.meta_dev_idx = bdev->disk_conf->meta_dev_idx;
+       bdev->md.md_offset = drbd_md_ss(bdev);
 
        if (drbd_md_sync_page_io(mdev, bdev, bdev->md.md_offset, READ)) {
                /* NOTE: can't do normal error processing here as this is
@@ -2949,45 +3118,51 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
                rv = ERR_MD_UNCLEAN;
                goto err;
        }
+
+       rv = ERR_MD_INVALID;
        if (magic != DRBD_MD_MAGIC_08) {
                if (magic == DRBD_MD_MAGIC_07)
                        dev_err(DEV, "Found old (0.7) meta data magic. Did you \"drbdadm create-md\"?\n");
                else
                        dev_err(DEV, "Meta data magic not found. Did you \"drbdadm create-md\"?\n");
-               rv = ERR_MD_INVALID;
                goto err;
        }
-       if (be32_to_cpu(buffer->al_offset) != bdev->md.al_offset) {
-               dev_err(DEV, "unexpected al_offset: %d (expected %d)\n",
-                   be32_to_cpu(buffer->al_offset), bdev->md.al_offset);
-               rv = ERR_MD_INVALID;
+
+       if (be32_to_cpu(buffer->bm_bytes_per_bit) != BM_BLOCK_SIZE) {
+               dev_err(DEV, "unexpected bm_bytes_per_bit: %u (expected %u)\n",
+                   be32_to_cpu(buffer->bm_bytes_per_bit), BM_BLOCK_SIZE);
                goto err;
        }
+
+
+       /* convert from on-disk big endian to in-core (native) byte order */
+       bdev->md.la_size_sect = be64_to_cpu(buffer->la_size_sect);
+       for (i = UI_CURRENT; i < UI_SIZE; i++)
+               bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]);
+       bdev->md.flags = be32_to_cpu(buffer->flags);
+       bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid);
+
+       bdev->md.md_size_sect = be32_to_cpu(buffer->md_size_sect);
+       bdev->md.al_offset = be32_to_cpu(buffer->al_offset);
+       bdev->md.bm_offset = be32_to_cpu(buffer->bm_offset);
+
+       if (check_activity_log_stripe_size(mdev, buffer, &bdev->md))
+               goto err;
+       if (check_offsets_and_sizes(mdev, bdev))
+               goto err;
+
        if (be32_to_cpu(buffer->bm_offset) != bdev->md.bm_offset) {
                dev_err(DEV, "unexpected bm_offset: %d (expected %d)\n",
                    be32_to_cpu(buffer->bm_offset), bdev->md.bm_offset);
-               rv = ERR_MD_INVALID;
                goto err;
        }
        if (be32_to_cpu(buffer->md_size_sect) != bdev->md.md_size_sect) {
                dev_err(DEV, "unexpected md_size: %u (expected %u)\n",
                    be32_to_cpu(buffer->md_size_sect), bdev->md.md_size_sect);
-               rv = ERR_MD_INVALID;
                goto err;
        }
 
-       if (be32_to_cpu(buffer->bm_bytes_per_bit) != BM_BLOCK_SIZE) {
-               dev_err(DEV, "unexpected bm_bytes_per_bit: %u (expected %u)\n",
-                   be32_to_cpu(buffer->bm_bytes_per_bit), BM_BLOCK_SIZE);
-               rv = ERR_MD_INVALID;
-               goto err;
-       }
-
-       bdev->md.la_size_sect = be64_to_cpu(buffer->la_size);
-       for (i = UI_CURRENT; i < UI_SIZE; i++)
-               bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]);
-       bdev->md.flags = be32_to_cpu(buffer->flags);
-       bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid);
+       rv = NO_ERROR;
 
        spin_lock_irq(&mdev->tconn->req_lock);
        if (mdev->state.conn < C_CONNECTED) {
@@ -3000,8 +3175,6 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
 
  err:
        drbd_md_put_buffer(mdev);
- out:
-       put_ldev(mdev);
 
        return rv;
 }
@@ -3239,8 +3412,12 @@ static int w_go_diskless(struct drbd_work *w, int unused)
         * end up here after a failed attach, before ldev was even assigned.
         */
        if (mdev->bitmap && mdev->ldev) {
+               /* An interrupted resync or similar is allowed to recount bits
+                * while we detach.
+                * Any modifications would not be expected anymore, though.
+                */
                if (drbd_bitmap_io_from_worker(mdev, drbd_bm_write,
-                                       "detach", BM_LOCKED_MASK)) {
+                                       "detach", BM_LOCKED_TEST_ALLOWED)) {
                        if (test_bit(WAS_READ_ERROR, &mdev->flags)) {
                                drbd_md_set_flag(mdev, MDF_FULL_SYNC);
                                drbd_md_sync(mdev);
@@ -3252,13 +3429,6 @@ static int w_go_diskless(struct drbd_work *w, int unused)
        return 0;
 }
 
-void drbd_go_diskless(struct drbd_conf *mdev)
-{
-       D_ASSERT(mdev->state.disk == D_FAILED);
-       if (!test_and_set_bit(GO_DISKLESS, &mdev->flags))
-               drbd_queue_work(&mdev->tconn->sender_work, &mdev->go_diskless);
-}
-
 /**
  * drbd_queue_bitmap_io() - Queues an IO operation on the whole bitmap
  * @mdev:      DRBD device.
index 2af26fc9528083d23cbde63fbe42128a26757599..9e3f441e7e8441e83d61273ed0f34f1c309c518b 100644 (file)
@@ -696,37 +696,52 @@ out:
        return 0;
 }
 
-/* initializes the md.*_offset members, so we are able to find
- * the on disk meta data */
+/* Initializes the md.*_offset members, so we are able to find
+ * the on disk meta data.
+ *
+ * We currently have two possible layouts:
+ * external:
+ *   |----------- md_size_sect ------------------|
+ *   [ 4k superblock ][ activity log ][  Bitmap  ]
+ *   | al_offset == 8 |
+ *   | bm_offset = al_offset + X      |
+ *  ==> bitmap sectors = md_size_sect - bm_offset
+ *
+ * internal:
+ *            |----------- md_size_sect ------------------|
+ * [data.....][  Bitmap  ][ activity log ][ 4k superblock ]
+ *                        | al_offset < 0 |
+ *            | bm_offset = al_offset - Y |
+ *  ==> bitmap sectors = Y = al_offset - bm_offset
+ *
+ *  Activity log size used to be fixed 32kB,
+ *  but is about to become configurable.
+ */
 static void drbd_md_set_sector_offsets(struct drbd_conf *mdev,
                                       struct drbd_backing_dev *bdev)
 {
        sector_t md_size_sect = 0;
-       int meta_dev_idx;
+       unsigned int al_size_sect = bdev->md.al_size_4k * 8;
 
-       rcu_read_lock();
-       meta_dev_idx = rcu_dereference(bdev->disk_conf)->meta_dev_idx;
+       bdev->md.md_offset = drbd_md_ss(bdev);
 
-       switch (meta_dev_idx) {
+       switch (bdev->md.meta_dev_idx) {
        default:
                /* v07 style fixed size indexed meta data */
-               bdev->md.md_size_sect = MD_RESERVED_SECT;
-               bdev->md.md_offset = drbd_md_ss__(mdev, bdev);
-               bdev->md.al_offset = MD_AL_OFFSET;
-               bdev->md.bm_offset = MD_BM_OFFSET;
+               bdev->md.md_size_sect = MD_128MB_SECT;
+               bdev->md.al_offset = MD_4kB_SECT;
+               bdev->md.bm_offset = MD_4kB_SECT + al_size_sect;
                break;
        case DRBD_MD_INDEX_FLEX_EXT:
                /* just occupy the full device; unit: sectors */
                bdev->md.md_size_sect = drbd_get_capacity(bdev->md_bdev);
-               bdev->md.md_offset = 0;
-               bdev->md.al_offset = MD_AL_OFFSET;
-               bdev->md.bm_offset = MD_BM_OFFSET;
+               bdev->md.al_offset = MD_4kB_SECT;
+               bdev->md.bm_offset = MD_4kB_SECT + al_size_sect;
                break;
        case DRBD_MD_INDEX_INTERNAL:
        case DRBD_MD_INDEX_FLEX_INT:
-               bdev->md.md_offset = drbd_md_ss__(mdev, bdev);
                /* al size is still fixed */
-               bdev->md.al_offset = -MD_AL_SECTORS;
+               bdev->md.al_offset = -al_size_sect;
                /* we need (slightly less than) ~ this much bitmap sectors: */
                md_size_sect = drbd_get_capacity(bdev->backing_bdev);
                md_size_sect = ALIGN(md_size_sect, BM_SECT_PER_EXT);
@@ -735,14 +750,13 @@ static void drbd_md_set_sector_offsets(struct drbd_conf *mdev,
 
                /* plus the "drbd meta data super block",
                 * and the activity log; */
-               md_size_sect += MD_BM_OFFSET;
+               md_size_sect += MD_4kB_SECT + al_size_sect;
 
                bdev->md.md_size_sect = md_size_sect;
                /* bitmap offset is adjusted by 'super' block size */
-               bdev->md.bm_offset   = -md_size_sect + MD_AL_OFFSET;
+               bdev->md.bm_offset   = -md_size_sect + MD_4kB_SECT;
                break;
        }
-       rcu_read_unlock();
 }
 
 /* input size is expected to be in KB */
@@ -805,7 +819,7 @@ void drbd_resume_io(struct drbd_conf *mdev)
 enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds_flags flags) __must_hold(local)
 {
        sector_t prev_first_sect, prev_size; /* previous meta location */
-       sector_t la_size, u_size;
+       sector_t la_size_sect, u_size;
        sector_t size;
        char ppb[10];
 
@@ -828,7 +842,7 @@ enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds
 
        prev_first_sect = drbd_md_first_sector(mdev->ldev);
        prev_size = mdev->ldev->md.md_size_sect;
-       la_size = mdev->ldev->md.la_size_sect;
+       la_size_sect = mdev->ldev->md.la_size_sect;
 
        /* TODO: should only be some assert here, not (re)init... */
        drbd_md_set_sector_offsets(mdev, mdev->ldev);
@@ -864,7 +878,7 @@ enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds
        if (rv == dev_size_error)
                goto out;
 
-       la_size_changed = (la_size != mdev->ldev->md.la_size_sect);
+       la_size_changed = (la_size_sect != mdev->ldev->md.la_size_sect);
 
        md_moved = prev_first_sect != drbd_md_first_sector(mdev->ldev)
                || prev_size       != mdev->ldev->md.md_size_sect;
@@ -886,9 +900,9 @@ enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds
                drbd_md_mark_dirty(mdev);
        }
 
-       if (size > la_size)
+       if (size > la_size_sect)
                rv = grew;
-       if (size < la_size)
+       if (size < la_size_sect)
                rv = shrunk;
 out:
        lc_unlock(mdev->act_log);
@@ -903,7 +917,7 @@ drbd_new_dev_size(struct drbd_conf *mdev, struct drbd_backing_dev *bdev,
                  sector_t u_size, int assume_peer_has_space)
 {
        sector_t p_size = mdev->p_size;   /* partner's disk size. */
-       sector_t la_size = bdev->md.la_size_sect; /* last agreed size. */
+       sector_t la_size_sect = bdev->md.la_size_sect; /* last agreed size. */
        sector_t m_size; /* my size */
        sector_t size = 0;
 
@@ -917,8 +931,8 @@ drbd_new_dev_size(struct drbd_conf *mdev, struct drbd_backing_dev *bdev,
        if (p_size && m_size) {
                size = min_t(sector_t, p_size, m_size);
        } else {
-               if (la_size) {
-                       size = la_size;
+               if (la_size_sect) {
+                       size = la_size_sect;
                        if (m_size && m_size < size)
                                size = m_size;
                        if (p_size && p_size < size)
@@ -1127,15 +1141,32 @@ static bool should_set_defaults(struct genl_info *info)
        return 0 != (flags & DRBD_GENL_F_SET_DEFAULTS);
 }
 
-static void enforce_disk_conf_limits(struct disk_conf *dc)
+static unsigned int drbd_al_extents_max(struct drbd_backing_dev *bdev)
 {
-       if (dc->al_extents < DRBD_AL_EXTENTS_MIN)
-               dc->al_extents = DRBD_AL_EXTENTS_MIN;
-       if (dc->al_extents > DRBD_AL_EXTENTS_MAX)
-               dc->al_extents = DRBD_AL_EXTENTS_MAX;
+       /* This is limited by 16 bit "slot" numbers,
+        * and by available on-disk context storage.
+        *
+        * Also (u16)~0 is special (denotes a "free" extent).
+        *
+        * One transaction occupies one 4kB on-disk block,
+        * we have n such blocks in the on disk ring buffer,
+        * the "current" transaction may fail, so only (n-1) are usable,
+        * and there are 919 slots of context information per transaction.
+        *
+        * 72 transaction blocks amount to more than 2**16 context slots,
+        * so cap there first.
+        */
+       const unsigned int max_al_nr = DRBD_AL_EXTENTS_MAX;
+       const unsigned int sufficient_on_disk =
+               (max_al_nr + AL_CONTEXT_PER_TRANSACTION - 1)
+               / AL_CONTEXT_PER_TRANSACTION;
+
+       unsigned int al_size_4k = bdev->md.al_size_4k;
+
+       if (al_size_4k > sufficient_on_disk)
+               return max_al_nr;
 
-       if (dc->c_plan_ahead > DRBD_C_PLAN_AHEAD_MAX)
-               dc->c_plan_ahead = DRBD_C_PLAN_AHEAD_MAX;
+       return (al_size_4k - 1) * AL_CONTEXT_PER_TRANSACTION;
 }
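/*
 * A small sketch of the arithmetic, using the 919 context slots per
 * transaction mentioned above:
 *   72 * 919 = 66168 > 65536 = 2**16, so from 72 on-disk transaction
 *   blocks upwards the 16-bit slot number (and hence DRBD_AL_EXTENTS_MAX)
 *   is the limiting factor rather than the on-disk space.
 *   With the legacy 32 kB activity log (al_size_4k = 8), the usable
 *   on-disk context is (8 - 1) * 919 = 6433 slots.
 */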
 
 int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info)
@@ -1182,7 +1213,13 @@ int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info)
        if (!expect(new_disk_conf->resync_rate >= 1))
                new_disk_conf->resync_rate = 1;
 
-       enforce_disk_conf_limits(new_disk_conf);
+       if (new_disk_conf->al_extents < DRBD_AL_EXTENTS_MIN)
+               new_disk_conf->al_extents = DRBD_AL_EXTENTS_MIN;
+       if (new_disk_conf->al_extents > drbd_al_extents_max(mdev->ldev))
+               new_disk_conf->al_extents = drbd_al_extents_max(mdev->ldev);
+
+       if (new_disk_conf->c_plan_ahead > DRBD_C_PLAN_AHEAD_MAX)
+               new_disk_conf->c_plan_ahead = DRBD_C_PLAN_AHEAD_MAX;
 
        fifo_size = (new_disk_conf->c_plan_ahead * 10 * SLEEP_TIME) / HZ;
        if (fifo_size != mdev->rs_plan_s->size) {
@@ -1330,7 +1367,8 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
                goto fail;
        }
 
-       enforce_disk_conf_limits(new_disk_conf);
+       if (new_disk_conf->c_plan_ahead > DRBD_C_PLAN_AHEAD_MAX)
+               new_disk_conf->c_plan_ahead = DRBD_C_PLAN_AHEAD_MAX;
 
        new_plan = fifo_alloc((new_disk_conf->c_plan_ahead * 10 * SLEEP_TIME) / HZ);
        if (!new_plan) {
@@ -1343,6 +1381,12 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
                goto fail;
        }
 
+       write_lock_irq(&global_state_lock);
+       retcode = drbd_resync_after_valid(mdev, new_disk_conf->resync_after);
+       write_unlock_irq(&global_state_lock);
+       if (retcode != NO_ERROR)
+               goto fail;
+
        rcu_read_lock();
        nc = rcu_dereference(mdev->tconn->net_conf);
        if (nc) {
@@ -1399,8 +1443,16 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
                goto fail;
        }
 
-       /* RT - for drbd_get_max_capacity() DRBD_MD_INDEX_FLEX_INT */
-       drbd_md_set_sector_offsets(mdev, nbc);
+       /* Read our meta data super block early.
+        * This also sets other on-disk offsets. */
+       retcode = drbd_md_read(mdev, nbc);
+       if (retcode != NO_ERROR)
+               goto fail;
+
+       if (new_disk_conf->al_extents < DRBD_AL_EXTENTS_MIN)
+               new_disk_conf->al_extents = DRBD_AL_EXTENTS_MIN;
+       if (new_disk_conf->al_extents > drbd_al_extents_max(nbc))
+               new_disk_conf->al_extents = drbd_al_extents_max(nbc);
 
        if (drbd_get_max_capacity(nbc) < new_disk_conf->disk_size) {
                dev_err(DEV, "max capacity %llu smaller than disk size %llu\n",
@@ -1416,7 +1468,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
                min_md_device_sectors = (2<<10);
        } else {
                max_possible_sectors = DRBD_MAX_SECTORS;
-               min_md_device_sectors = MD_RESERVED_SECT * (new_disk_conf->meta_dev_idx + 1);
+               min_md_device_sectors = MD_128MB_SECT * (new_disk_conf->meta_dev_idx + 1);
        }
 
        if (drbd_get_capacity(nbc->md_bdev) < min_md_device_sectors) {
@@ -1467,8 +1519,6 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
        if (!get_ldev_if_state(mdev, D_ATTACHING))
                goto force_diskless;
 
-       drbd_md_set_sector_offsets(mdev, nbc);
-
        if (!mdev->bitmap) {
                if (drbd_bm_init(mdev)) {
                        retcode = ERR_NOMEM;
@@ -1476,10 +1526,6 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
                }
        }
 
-       retcode = drbd_md_read(mdev, nbc);
-       if (retcode != NO_ERROR)
-               goto force_diskless_dec;
-
        if (mdev->state.conn < C_CONNECTED &&
            mdev->state.role == R_PRIMARY &&
            (mdev->ed_uuid & ~((u64)1)) != (nbc->md.uuid[UI_CURRENT] & ~((u64)1))) {
@@ -2158,8 +2204,11 @@ static enum drbd_state_rv conn_try_disconnect(struct drbd_tconn *tconn, bool for
                return SS_SUCCESS;
        case SS_PRIMARY_NOP:
                /* Our state checking code wants to see the peer outdated. */
-               rv = conn_request_state(tconn, NS2(conn, C_DISCONNECTING,
-                                               pdsk, D_OUTDATED), CS_VERBOSE);
+               rv = conn_request_state(tconn, NS2(conn, C_DISCONNECTING, pdsk, D_OUTDATED), 0);
+
+               if (rv == SS_OUTDATE_WO_CONN) /* lost connection before graceful disconnect succeeded */
+                       rv = conn_request_state(tconn, NS(conn, C_DISCONNECTING), CS_VERBOSE);
+
                break;
        case SS_CW_FAILED_BY_PEER:
                /* The peer probably wants to see us outdated. */
@@ -2406,22 +2455,19 @@ int drbd_adm_invalidate(struct sk_buff *skb, struct genl_info *info)
        wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags));
        drbd_flush_workqueue(mdev);
 
-       retcode = _drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T), CS_ORDERED);
-
-       if (retcode < SS_SUCCESS && retcode != SS_NEED_CONNECTION)
-               retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T));
-
-       while (retcode == SS_NEED_CONNECTION) {
-               spin_lock_irq(&mdev->tconn->req_lock);
-               if (mdev->state.conn < C_CONNECTED)
-                       retcode = _drbd_set_state(_NS(mdev, disk, D_INCONSISTENT), CS_VERBOSE, NULL);
-               spin_unlock_irq(&mdev->tconn->req_lock);
-
-               if (retcode != SS_NEED_CONNECTION)
-                       break;
-
+       /* If we happen to be C_STANDALONE R_SECONDARY, just change to
+        * D_INCONSISTENT, and set all bits in the bitmap.  Otherwise,
+        * try to start a resync handshake as sync target for full sync.
+        */
+       if (mdev->state.conn == C_STANDALONE && mdev->state.role == R_SECONDARY) {
+               retcode = drbd_request_state(mdev, NS(disk, D_INCONSISTENT));
+               if (retcode >= SS_SUCCESS) {
+                       if (drbd_bitmap_io(mdev, &drbd_bmio_set_n_write,
+                               "set_n_write from invalidate", BM_LOCKED_MASK))
+                               retcode = ERR_IO_MD_DISK;
+               }
+       } else
                retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T));
-       }
        drbd_resume_io(mdev);
 
 out:
@@ -2475,21 +2521,22 @@ int drbd_adm_invalidate_peer(struct sk_buff *skb, struct genl_info *info)
        wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags));
        drbd_flush_workqueue(mdev);
 
-       retcode = _drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S), CS_ORDERED);
-       if (retcode < SS_SUCCESS) {
-               if (retcode == SS_NEED_CONNECTION && mdev->state.role == R_PRIMARY) {
-                       /* The peer will get a resync upon connect anyways.
-                        * Just make that into a full resync. */
-                       retcode = drbd_request_state(mdev, NS(pdsk, D_INCONSISTENT));
-                       if (retcode >= SS_SUCCESS) {
-                               if (drbd_bitmap_io(mdev, &drbd_bmio_set_susp_al,
-                                                  "set_n_write from invalidate_peer",
-                                                  BM_LOCKED_SET_ALLOWED))
-                                       retcode = ERR_IO_MD_DISK;
-                       }
-               } else
-                       retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S));
-       }
+       /* If we happen to be C_STANDALONE R_PRIMARY, just set all bits
+        * in the bitmap.  Otherwise, try to start a resync handshake
+        * as sync source for full sync.
+        */
+       if (mdev->state.conn == C_STANDALONE && mdev->state.role == R_PRIMARY) {
+               /* The peer will get a resync upon connect anyway. Just make that
+                  into a full resync. */
+               retcode = drbd_request_state(mdev, NS(pdsk, D_INCONSISTENT));
+               if (retcode >= SS_SUCCESS) {
+                       if (drbd_bitmap_io(mdev, &drbd_bmio_set_susp_al,
+                               "set_n_write from invalidate_peer",
+                               BM_LOCKED_SET_ALLOWED))
+                               retcode = ERR_IO_MD_DISK;
+               }
+       } else
+               retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S));
        drbd_resume_io(mdev);
 
 out:
@@ -3162,6 +3209,7 @@ static enum drbd_ret_code adm_delete_minor(struct drbd_conf *mdev)
                                    CS_VERBOSE + CS_WAIT_COMPLETE);
                idr_remove(&mdev->tconn->volumes, mdev->vnr);
                idr_remove(&minors, mdev_to_minor(mdev));
+               destroy_workqueue(mdev->submit.wq);
                del_gendisk(mdev->vdisk);
                synchronize_rcu();
                kref_put(&mdev->kref, &drbd_minor_destroy);
index 928adb815b09f6b83fc6420faaeeaf8bd64f8404..bf31d41dbaadf023a57057bb03006c77146c7879 100644 (file)
@@ -313,8 +313,14 @@ static int drbd_seq_show(struct seq_file *seq, void *v)
 
 static int drbd_proc_open(struct inode *inode, struct file *file)
 {
-       if (try_module_get(THIS_MODULE))
-               return single_open(file, drbd_seq_show, PDE_DATA(inode));
+       int err;
+
+       if (try_module_get(THIS_MODULE)) {
+               err = single_open(file, drbd_seq_show, PDE_DATA(inode));
+               if (err)
+                       module_put(THIS_MODULE);
+               return err;
+       }
        return -ENODEV;
 }
 
index 83c5ae0ed56b4f1f41c7a655ad756e6f64766c26..4222affff488bc15c116187e2eb382af5f2d0508 100644 (file)
@@ -850,6 +850,7 @@ int drbd_connected(struct drbd_conf *mdev)
                err = drbd_send_current_state(mdev);
        clear_bit(USE_DEGR_WFC_T, &mdev->flags);
        clear_bit(RESIZE_PENDING, &mdev->flags);
+       atomic_set(&mdev->ap_in_flight, 0);
        mod_timer(&mdev->request_timer, jiffies + HZ); /* just start it here. */
        return err;
 }
@@ -2266,7 +2267,7 @@ static int receive_Data(struct drbd_tconn *tconn, struct packet_info *pi)
                drbd_set_out_of_sync(mdev, peer_req->i.sector, peer_req->i.size);
                peer_req->flags |= EE_CALL_AL_COMPLETE_IO;
                peer_req->flags &= ~EE_MAY_SET_IN_SYNC;
-               drbd_al_begin_io(mdev, &peer_req->i);
+               drbd_al_begin_io(mdev, &peer_req->i, true);
        }
 
        err = drbd_submit_peer_request(mdev, peer_req, rw, DRBD_FAULT_DT_WR);
@@ -2662,7 +2663,6 @@ static int drbd_asb_recover_1p(struct drbd_conf *mdev) __must_hold(local)
                if (hg == -1 && mdev->state.role == R_PRIMARY) {
                        enum drbd_state_rv rv2;
 
-                       drbd_set_role(mdev, R_SECONDARY, 0);
                         /* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE,
                          * we might be here in C_WF_REPORT_PARAMS which is transient.
                          * we do not need to wait for the after state change work either. */
@@ -3993,7 +3993,7 @@ static int receive_state(struct drbd_tconn *tconn, struct packet_info *pi)
 
        clear_bit(DISCARD_MY_DATA, &mdev->flags);
 
-       drbd_md_sync(mdev); /* update connected indicator, la_size, ... */
+       drbd_md_sync(mdev); /* update connected indicator, la_size_sect, ... */
 
        return 0;
 }
@@ -4660,8 +4660,8 @@ static int drbd_do_features(struct drbd_tconn *tconn)
 #if !defined(CONFIG_CRYPTO_HMAC) && !defined(CONFIG_CRYPTO_HMAC_MODULE)
 static int drbd_do_auth(struct drbd_tconn *tconn)
 {
-       dev_err(DEV, "This kernel was build without CONFIG_CRYPTO_HMAC.\n");
-       dev_err(DEV, "You need to disable 'cram-hmac-alg' in drbd.conf.\n");
+       conn_err(tconn, "This kernel was built without CONFIG_CRYPTO_HMAC.\n");
+       conn_err(tconn, "You need to disable 'cram-hmac-alg' in drbd.conf.\n");
        return -1;
 }
 #else
@@ -5258,9 +5258,11 @@ int drbd_asender(struct drbd_thread *thi)
        bool ping_timeout_active = false;
        struct net_conf *nc;
        int ping_timeo, tcp_cork, ping_int;
+       struct sched_param param = { .sched_priority = 2 };
 
-       current->policy = SCHED_RR;  /* Make this a realtime task! */
-       current->rt_priority = 2;    /* more important than all other tasks */
+       rv = sched_setscheduler(current, SCHED_RR, &param);
+       if (rv < 0)
+               conn_err(tconn, "drbd_asender: ERROR set priority, ret=%d\n", rv);
 
        while (get_t_state(thi) == RUNNING) {
                drbd_thread_current_set_cpu(thi);
index 2b8303ad63c97966a3913854825cee60bec5320c..c24379ffd4e309cb0344f138854a131e12cc804e 100644 (file)
 static bool drbd_may_do_local_read(struct drbd_conf *mdev, sector_t sector, int size);
 
 /* Update disk stats at start of I/O request */
-static void _drbd_start_io_acct(struct drbd_conf *mdev, struct drbd_request *req, struct bio *bio)
+static void _drbd_start_io_acct(struct drbd_conf *mdev, struct drbd_request *req)
 {
-       const int rw = bio_data_dir(bio);
+       const int rw = bio_data_dir(req->master_bio);
        int cpu;
        cpu = part_stat_lock();
        part_round_stats(cpu, &mdev->vdisk->part0);
        part_stat_inc(cpu, &mdev->vdisk->part0, ios[rw]);
-       part_stat_add(cpu, &mdev->vdisk->part0, sectors[rw], bio_sectors(bio));
+       part_stat_add(cpu, &mdev->vdisk->part0, sectors[rw], req->i.size >> 9);
        (void) cpu; /* The macro invocations above want the cpu argument, I do not like
                       the compiler warning about cpu only assigned but never used... */
        part_inc_in_flight(&mdev->vdisk->part0, rw);
@@ -263,8 +263,7 @@ void drbd_req_complete(struct drbd_request *req, struct bio_and_error *m)
                else
                        root = &mdev->read_requests;
                drbd_remove_request_interval(root, req);
-       } else if (!(s & RQ_POSTPONED))
-               D_ASSERT((s & (RQ_NET_MASK & ~RQ_NET_DONE)) == 0);
+       }
 
        /* Before we can signal completion to the upper layers,
         * we may need to close the current transfer log epoch.
@@ -755,6 +754,11 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
                D_ASSERT(req->rq_state & RQ_NET_PENDING);
                mod_rq_state(req, m, RQ_NET_PENDING, RQ_NET_OK|RQ_NET_DONE);
                break;
+
+       case QUEUE_AS_DRBD_BARRIER:
+               start_new_tl_epoch(mdev->tconn);
+               mod_rq_state(req, m, 0, RQ_NET_OK|RQ_NET_DONE);
+               break;
        };
 
        return rv;
@@ -861,8 +865,10 @@ static void maybe_pull_ahead(struct drbd_conf *mdev)
        bool congested = false;
        enum drbd_on_congestion on_congestion;
 
+       rcu_read_lock();
        nc = rcu_dereference(tconn->net_conf);
        on_congestion = nc ? nc->on_congestion : OC_BLOCK;
+       rcu_read_unlock();
        if (on_congestion == OC_BLOCK ||
            tconn->agreed_pro_version < 96)
                return;
@@ -956,14 +962,8 @@ static int drbd_process_write_request(struct drbd_request *req)
        struct drbd_conf *mdev = req->w.mdev;
        int remote, send_oos;
 
-       rcu_read_lock();
        remote = drbd_should_do_remote(mdev->state);
-       if (remote) {
-               maybe_pull_ahead(mdev);
-               remote = drbd_should_do_remote(mdev->state);
-       }
        send_oos = drbd_should_send_out_of_sync(mdev->state);
-       rcu_read_unlock();
 
        /* Need to replicate writes.  Unless it is an empty flush,
         * which is better mapped to a DRBD P_BARRIER packet,
@@ -975,8 +975,8 @@ static int drbd_process_write_request(struct drbd_request *req)
                /* The only size==0 bios we expect are empty flushes. */
                D_ASSERT(req->master_bio->bi_rw & REQ_FLUSH);
                if (remote)
-                       start_new_tl_epoch(mdev->tconn);
-               return 0;
+                       _req_mod(req, QUEUE_AS_DRBD_BARRIER);
+               return remote;
        }
 
        if (!remote && !send_oos)
@@ -1020,12 +1020,24 @@ drbd_submit_req_private_bio(struct drbd_request *req)
                bio_endio(bio, -EIO);
 }
 
-void __drbd_make_request(struct drbd_conf *mdev, struct bio *bio, unsigned long start_time)
+static void drbd_queue_write(struct drbd_conf *mdev, struct drbd_request *req)
 {
-       const int rw = bio_rw(bio);
-       struct bio_and_error m = { NULL, };
+       spin_lock(&mdev->submit.lock);
+       list_add_tail(&req->tl_requests, &mdev->submit.writes);
+       spin_unlock(&mdev->submit.lock);
+       queue_work(mdev->submit.wq, &mdev->submit.worker);
+}
+
+/* returns the new drbd_request pointer, if the caller is expected to
+ * drbd_send_and_submit() it (to save latency), or NULL if we queued the
+ * request on the submitter thread.
+ * Returns ERR_PTR(-ENOMEM) if we cannot allocate a drbd_request.
+ */
+struct drbd_request *
+drbd_request_prepare(struct drbd_conf *mdev, struct bio *bio, unsigned long start_time)
+{
+       const int rw = bio_data_dir(bio);
        struct drbd_request *req;
-       bool no_remote = false;
 
        /* allocate outside of all locks; */
        req = drbd_req_new(mdev, bio);
@@ -1035,7 +1047,7 @@ void __drbd_make_request(struct drbd_conf *mdev, struct bio *bio, unsigned long
                 * if user cannot handle io errors, that's not our business. */
                dev_err(DEV, "could not kmalloc() req\n");
                bio_endio(bio, -ENOMEM);
-               return;
+               return ERR_PTR(-ENOMEM);
        }
        req->start_time = start_time;
 
@@ -1044,28 +1056,40 @@ void __drbd_make_request(struct drbd_conf *mdev, struct bio *bio, unsigned long
                req->private_bio = NULL;
        }
 
-       /* For WRITES going to the local disk, grab a reference on the target
-        * extent.  This waits for any resync activity in the corresponding
-        * resync extent to finish, and, if necessary, pulls in the target
-        * extent into the activity log, which involves further disk io because
-        * of transactional on-disk meta data updates.
-        * Empty flushes don't need to go into the activity log, they can only
-        * flush data for pending writes which are already in there. */
+       /* Update disk stats */
+       _drbd_start_io_acct(mdev, req);
+
        if (rw == WRITE && req->private_bio && req->i.size
        && !test_bit(AL_SUSPENDED, &mdev->flags)) {
+               if (!drbd_al_begin_io_fastpath(mdev, &req->i)) {
+                       drbd_queue_write(mdev, req);
+                       return NULL;
+               }
                req->rq_state |= RQ_IN_ACT_LOG;
-               drbd_al_begin_io(mdev, &req->i);
        }
 
+       return req;
+}
+
+static void drbd_send_and_submit(struct drbd_conf *mdev, struct drbd_request *req)
+{
+       const int rw = bio_rw(req->master_bio);
+       struct bio_and_error m = { NULL, };
+       bool no_remote = false;
+
        spin_lock_irq(&mdev->tconn->req_lock);
        if (rw == WRITE) {
                /* This may temporarily give up the req_lock,
                 * but will re-acquire it before it returns here.
                 * Needs to be before the check on drbd_suspended() */
                complete_conflicting_writes(req);
+               /* no more giving up req_lock from now on! */
+
+               /* check for congestion, and potentially stop sending
+                * full data updates, but start sending "dirty bits" only. */
+               maybe_pull_ahead(mdev);
        }
 
-       /* no more giving up req_lock from now on! */
 
        if (drbd_suspended(mdev)) {
                /* push back and retry: */
@@ -1078,9 +1102,6 @@ void __drbd_make_request(struct drbd_conf *mdev, struct bio *bio, unsigned long
                goto out;
        }
 
-       /* Update disk stats */
-       _drbd_start_io_acct(mdev, req, bio);
-
        /* We fail READ/READA early, if we can not serve it.
         * We must do this before req is registered on any lists.
         * Otherwise, drbd_req_complete() will queue failed READ for retry. */
@@ -1137,7 +1158,116 @@ out:
 
        if (m.bio)
                complete_master_bio(mdev, &m);
-       return;
+}
+
+void __drbd_make_request(struct drbd_conf *mdev, struct bio *bio, unsigned long start_time)
+{
+       struct drbd_request *req = drbd_request_prepare(mdev, bio, start_time);
+       if (IS_ERR_OR_NULL(req))
+               return;
+       drbd_send_and_submit(mdev, req);
+}
+
+static void submit_fast_path(struct drbd_conf *mdev, struct list_head *incoming)
+{
+       struct drbd_request *req, *tmp;
+       list_for_each_entry_safe(req, tmp, incoming, tl_requests) {
+               const int rw = bio_data_dir(req->master_bio);
+
+               if (rw == WRITE /* rw != WRITE should not even end up here! */
+               && req->private_bio && req->i.size
+               && !test_bit(AL_SUSPENDED, &mdev->flags)) {
+                       if (!drbd_al_begin_io_fastpath(mdev, &req->i))
+                               continue;
+
+                       req->rq_state |= RQ_IN_ACT_LOG;
+               }
+
+               list_del_init(&req->tl_requests);
+               drbd_send_and_submit(mdev, req);
+       }
+}
+
+static bool prepare_al_transaction_nonblock(struct drbd_conf *mdev,
+                                           struct list_head *incoming,
+                                           struct list_head *pending)
+{
+       struct drbd_request *req, *tmp;
+       int wake = 0;
+       int err;
+
+       spin_lock_irq(&mdev->al_lock);
+       list_for_each_entry_safe(req, tmp, incoming, tl_requests) {
+               err = drbd_al_begin_io_nonblock(mdev, &req->i);
+               if (err == -EBUSY)
+                       wake = 1;
+               if (err)
+                       continue;
+               req->rq_state |= RQ_IN_ACT_LOG;
+               list_move_tail(&req->tl_requests, pending);
+       }
+       spin_unlock_irq(&mdev->al_lock);
+       if (wake)
+               wake_up(&mdev->al_wait);
+
+       return !list_empty(pending);
+}
+
+void do_submit(struct work_struct *ws)
+{
+       struct drbd_conf *mdev = container_of(ws, struct drbd_conf, submit.worker);
+       LIST_HEAD(incoming);
+       LIST_HEAD(pending);
+       struct drbd_request *req, *tmp;
+
+       for (;;) {
+               spin_lock(&mdev->submit.lock);
+               list_splice_tail_init(&mdev->submit.writes, &incoming);
+               spin_unlock(&mdev->submit.lock);
+
+               submit_fast_path(mdev, &incoming);
+               if (list_empty(&incoming))
+                       break;
+
+               wait_event(mdev->al_wait, prepare_al_transaction_nonblock(mdev, &incoming, &pending));
+               /* Maybe more was queued, while we prepared the transaction?
+                * Try to stuff them into this transaction as well.
+                * Be strictly non-blocking here, no wait_event, we already
+                * have something to commit.
+                * Stop if we don't make any more progress.
+                */
+               for (;;) {
+                       LIST_HEAD(more_pending);
+                       LIST_HEAD(more_incoming);
+                       bool made_progress;
+
+                       /* It is ok to look outside the lock,
+                        * it's only an optimization anyway */
+                       if (list_empty(&mdev->submit.writes))
+                               break;
+
+                       spin_lock(&mdev->submit.lock);
+                       list_splice_tail_init(&mdev->submit.writes, &more_incoming);
+                       spin_unlock(&mdev->submit.lock);
+
+                       if (list_empty(&more_incoming))
+                               break;
+
+                       made_progress = prepare_al_transaction_nonblock(mdev, &more_incoming, &more_pending);
+
+                       list_splice_tail_init(&more_pending, &pending);
+                       list_splice_tail_init(&more_incoming, &incoming);
+
+                       if (!made_progress)
+                               break;
+               }
+               drbd_al_begin_io_commit(mdev, false);
+
+               list_for_each_entry_safe(req, tmp, &pending, tl_requests) {
+                       list_del_init(&req->tl_requests);
+                       drbd_send_and_submit(mdev, req);
+               }
+       }
 }
 
 void drbd_make_request(struct request_queue *q, struct bio *bio)
index c08d22964d06784024481cd0005784306341b891..978cb1addc98845fb8ca49838cfb5ec2478170f7 100644 (file)
@@ -88,6 +88,14 @@ enum drbd_req_event {
        QUEUE_FOR_NET_READ,
        QUEUE_FOR_SEND_OOS,
 
+       /* An empty flush is queued as P_BARRIER,
+        * which will cause it to complete "successfully",
+        * even if the local disk flush failed.
+        *
+        * Just like "real" requests, empty flushes (blkdev_issue_flush()) will
+        * only see an error if neither local nor remote data is reachable. */
+       QUEUE_AS_DRBD_BARRIER,
+
        SEND_CANCELED,
        SEND_FAILED,
        HANDED_OVER_TO_NETWORK,
index 0fe220cfb9e9d31ce1dffac5e3365311076f66da..90c5be2b1d309bf041927891f2f8be24fc491bc1 100644 (file)
@@ -570,6 +570,13 @@ is_valid_state(struct drbd_conf *mdev, union drbd_state ns)
                  mdev->tconn->agreed_pro_version < 88)
                rv = SS_NOT_SUPPORTED;
 
+       else if (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE)
+               rv = SS_NO_UP_TO_DATE_DISK;
+
+       else if ((ns.conn == C_STARTING_SYNC_S || ns.conn == C_STARTING_SYNC_T) &&
+                 ns.pdsk == D_UNKNOWN)
+               rv = SS_NEED_CONNECTION;
+
        else if (ns.conn >= C_CONNECTED && ns.pdsk == D_UNKNOWN)
                rv = SS_CONNECTED_OUTDATES;
 
@@ -635,6 +642,10 @@ is_valid_soft_transition(union drbd_state os, union drbd_state ns, struct drbd_t
            && os.conn < C_WF_REPORT_PARAMS)
                rv = SS_NEED_CONNECTION; /* No NetworkFailure -> SyncTarget etc... */
 
+       if (ns.conn == C_DISCONNECTING && ns.pdsk == D_OUTDATED &&
+           os.conn < C_CONNECTED && os.pdsk > D_OUTDATED)
+               rv = SS_OUTDATE_WO_CONN;
+
        return rv;
 }
 
@@ -1377,13 +1388,6 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
                        &drbd_bmio_set_n_write, &abw_start_sync,
                        "set_n_write from StartingSync", BM_LOCKED_TEST_ALLOWED);
 
-       /* We are invalidating our self... */
-       if (os.conn < C_CONNECTED && ns.conn < C_CONNECTED &&
-           os.disk > D_INCONSISTENT && ns.disk == D_INCONSISTENT)
-               /* other bitmap operation expected during this phase */
-               drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL,
-                       "set_n_write from invalidate", BM_LOCKED_MASK);
-
        /* first half of local IO error, failure to attach,
         * or administrative detach */
        if (os.disk != D_FAILED && ns.disk == D_FAILED) {
@@ -1748,13 +1752,9 @@ _conn_rq_cond(struct drbd_tconn *tconn, union drbd_state mask, union drbd_state
        if (test_and_clear_bit(CONN_WD_ST_CHG_FAIL, &tconn->flags))
                return SS_CW_FAILED_BY_PEER;
 
-       rv = tconn->cstate != C_WF_REPORT_PARAMS ? SS_CW_NO_NEED : SS_UNKNOWN_ERROR;
-
-       if (rv == SS_UNKNOWN_ERROR)
-               rv = conn_is_valid_transition(tconn, mask, val, 0);
-
-       if (rv == SS_SUCCESS)
-               rv = SS_UNKNOWN_ERROR; /* cont waiting, otherwise fail. */
+       rv = conn_is_valid_transition(tconn, mask, val, 0);
+       if (rv == SS_SUCCESS && tconn->cstate == C_WF_REPORT_PARAMS)
+               rv = SS_UNKNOWN_ERROR; /* continue waiting */
 
        return rv;
 }
index 9a664bd27404b87336247d452e736650f7476deb..58e08ff2b2cebb2d2836b47af7a92c37ed17226c 100644 (file)
@@ -89,6 +89,7 @@ static const char *drbd_state_sw_errors[] = {
        [-SS_LOWER_THAN_OUTDATED] = "Disk state is lower than outdated",
        [-SS_IN_TRANSIENT_STATE] = "In transient state, retry after next state change",
        [-SS_CONCURRENT_ST_CHG] = "Concurrent state changes detected and aborted",
+       [-SS_OUTDATE_WO_CONN] = "Need a connection for a graceful disconnect/outdate peer",
        [-SS_O_VOL_PEER_PRI] = "Other vol primary on peer not allowed by config",
 };
 
index 424dc7bdf9b7f97500c2f6afa2ec4aa69ef3e1d0..891c0ecaa292c84998f7357b82e3a80cdc0f6484 100644 (file)
@@ -89,7 +89,8 @@ void drbd_md_io_complete(struct bio *bio, int error)
        md_io->done = 1;
        wake_up(&mdev->misc_wait);
        bio_put(bio);
-       put_ldev(mdev);
+       if (mdev->ldev) /* special case: drbd_md_read() during drbd_adm_attach() */
+               put_ldev(mdev);
 }
 
 /* reads on behalf of the partner,
@@ -1410,7 +1411,7 @@ int w_restart_disk_io(struct drbd_work *w, int cancel)
        struct drbd_conf *mdev = w->mdev;
 
        if (bio_data_dir(req->master_bio) == WRITE && req->rq_state & RQ_IN_ACT_LOG)
-               drbd_al_begin_io(mdev, &req->i);
+               drbd_al_begin_io(mdev, &req->i, false);
 
        drbd_req_make_private_bio(req, req->master_bio);
        req->private_bio->bi_bdev = mdev->ldev->backing_bdev;
@@ -1425,7 +1426,7 @@ static int _drbd_may_sync_now(struct drbd_conf *mdev)
        int resync_after;
 
        while (1) {
-               if (!odev->ldev)
+               if (!odev->ldev || odev->state.disk == D_DISKLESS)
                        return 1;
                rcu_read_lock();
                resync_after = rcu_dereference(odev->ldev->disk_conf)->resync_after;
@@ -1433,7 +1434,7 @@ static int _drbd_may_sync_now(struct drbd_conf *mdev)
                if (resync_after == -1)
                        return 1;
                odev = minor_to_mdev(resync_after);
-               if (!expect(odev))
+               if (!odev)
                        return 1;
                if ((odev->state.conn >= C_SYNC_SOURCE &&
                     odev->state.conn <= C_PAUSED_SYNC_T) ||
@@ -1515,7 +1516,7 @@ enum drbd_ret_code drbd_resync_after_valid(struct drbd_conf *mdev, int o_minor)
 
        if (o_minor == -1)
                return NO_ERROR;
-       if (o_minor < -1 || minor_to_mdev(o_minor) == NULL)
+       if (o_minor < -1 || o_minor > MINORMASK)
                return ERR_RESYNC_AFTER;
 
        /* check for loops */
@@ -1524,6 +1525,15 @@ enum drbd_ret_code drbd_resync_after_valid(struct drbd_conf *mdev, int o_minor)
                if (odev == mdev)
                        return ERR_RESYNC_AFTER_CYCLE;
 
+               /* You are free to depend on diskless, non-existing,
+                * or not yet/no longer existing minors.
+                * We only reject dependency loops.
+                * We cannot follow the dependency chain beyond a detached or
+                * missing minor.
+                */
+               if (!odev || !odev->ldev || odev->state.disk == D_DISKLESS)
+                       return NO_ERROR;
+
                rcu_read_lock();
                resync_after = rcu_dereference(odev->ldev->disk_conf)->resync_after;
                rcu_read_unlock();
@@ -1652,7 +1662,9 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side)
        clear_bit(B_RS_H_DONE, &mdev->flags);
 
        write_lock_irq(&global_state_lock);
-       if (!get_ldev_if_state(mdev, D_NEGOTIATING)) {
+       /* Did some connection breakage or IO error race with us? */
+       if (mdev->state.conn < C_CONNECTED
+       || !get_ldev_if_state(mdev, D_NEGOTIATING)) {
                write_unlock_irq(&global_state_lock);
                mutex_unlock(mdev->state_mutex);
                return;
index 2ddd64a9ffdee43429653806be3262a80c372240..04ceb7e2fadd6ca075d20ecd844c39bf1da07ff3 100644 (file)
@@ -3601,7 +3601,7 @@ static void __init config_types(void)
                pr_cont("\n");
 }
 
-static int floppy_release(struct gendisk *disk, fmode_t mode)
+static void floppy_release(struct gendisk *disk, fmode_t mode)
 {
        int drive = (long)disk->private_data;
 
@@ -3615,8 +3615,6 @@ static int floppy_release(struct gendisk *disk, fmode_t mode)
                opened_bdev[drive] = NULL;
        mutex_unlock(&open_lock);
        mutex_unlock(&floppy_mutex);
-
-       return 0;
 }
 
 /*
@@ -3777,7 +3775,6 @@ static int __floppy_read_block_0(struct block_device *bdev)
        bio_vec.bv_len = size;
        bio_vec.bv_offset = 0;
        bio.bi_vcnt = 1;
-       bio.bi_idx = 0;
        bio.bi_size = size;
        bio.bi_bdev = bdev;
        bio.bi_sector = 0;
index b2955b3f2cbc1bb43e19f1e379a5cc976730216d..d92d50fd84b7d4ec59d5537eed5d9fd288005c51 100644 (file)
@@ -1518,7 +1518,7 @@ out:
        return err;
 }
 
-static int lo_release(struct gendisk *disk, fmode_t mode)
+static void lo_release(struct gendisk *disk, fmode_t mode)
 {
        struct loop_device *lo = disk->private_data;
        int err;
@@ -1535,7 +1535,7 @@ static int lo_release(struct gendisk *disk, fmode_t mode)
                 */
                err = loop_clr_fd(lo);
                if (!err)
-                       goto out_unlocked;
+                       return;
        } else {
                /*
                 * Otherwise keep thread (if running) and config,
@@ -1546,8 +1546,6 @@ static int lo_release(struct gendisk *disk, fmode_t mode)
 
 out:
        mutex_unlock(&lo->lo_ctl_mutex);
-out_unlocked:
-       return 0;
 }
 
 static const struct block_device_operations lo_fops = {
index 076ae7f1b781e0b1586631b2708471ce98f978c1..a56cfcd5d648c928401a56052243b61805ca7f4c 100644 (file)
@@ -780,6 +780,7 @@ static const struct block_device_operations mg_disk_ops = {
        .getgeo = mg_getgeo
 };
 
+#ifdef CONFIG_PM_SLEEP
 static int mg_suspend(struct device *dev)
 {
        struct mg_drv_data *prv_data = dev->platform_data;
@@ -824,6 +825,7 @@ static int mg_resume(struct device *dev)
 
        return 0;
 }
+#endif
 
 static SIMPLE_DEV_PM_OPS(mg_pm, mg_suspend, mg_resume);
 
index 32c678028e53c36ec8a52183799ad97642c666e8..847107ef0cce52c65c4ff1f1aeaab6a1b4540922 100644 (file)
@@ -728,7 +728,10 @@ static void mtip_async_complete(struct mtip_port *port,
        atomic_set(&port->commands[tag].active, 0);
        release_slot(port, tag);
 
-       up(&port->cmd_slot);
+       if (unlikely(command->unaligned))
+               up(&port->cmd_slot_unal);
+       else
+               up(&port->cmd_slot);
 }
 
 /*
@@ -1560,10 +1563,12 @@ static int mtip_get_identify(struct mtip_port *port, void __user *user_buffer)
        }
 #endif
 
+#ifdef MTIP_TRIM /* Disabling TRIM support temporarily */
        /* Demux ID.DRAT & ID.RZAT to determine trim support */
        if (port->identify[69] & (1 << 14) && port->identify[69] & (1 << 5))
                port->dd->trim_supp = true;
        else
+#endif
                port->dd->trim_supp = false;
 
        /* Set the identify buffer as valid. */
@@ -2557,7 +2562,7 @@ static int mtip_hw_ioctl(struct driver_data *dd, unsigned int cmd,
  */
 static void mtip_hw_submit_io(struct driver_data *dd, sector_t sector,
                              int nsect, int nents, int tag, void *callback,
-                             void *data, int dir)
+                             void *data, int dir, int unaligned)
 {
        struct host_to_dev_fis  *fis;
        struct mtip_port *port = dd->port;
@@ -2570,6 +2575,7 @@ static void mtip_hw_submit_io(struct driver_data *dd, sector_t sector,
 
        command->scatter_ents = nents;
 
+       command->unaligned = unaligned;
        /*
         * The number of retries for this command before it is
         * reported as a failure to the upper layers.
@@ -2598,6 +2604,9 @@ static void mtip_hw_submit_io(struct driver_data *dd, sector_t sector,
        fis->res3        = 0;
        fill_command_sg(dd, command, nents);
 
+       if (unaligned)
+               fis->device |= 1 << 7;
+
        /* Populate the command header */
        command->command_header->opts =
                        __force_bit2int cpu_to_le32(
@@ -2644,9 +2653,13 @@ static void mtip_hw_submit_io(struct driver_data *dd, sector_t sector,
  * return value
  *      None
  */
-static void mtip_hw_release_scatterlist(struct driver_data *dd, int tag)
+static void mtip_hw_release_scatterlist(struct driver_data *dd, int tag,
+                                                               int unaligned)
 {
+       struct semaphore *sem = unaligned ? &dd->port->cmd_slot_unal :
+                                                       &dd->port->cmd_slot;
        release_slot(dd->port, tag);
+       up(sem);
 }
 
 /*
@@ -2661,22 +2674,25 @@ static void mtip_hw_release_scatterlist(struct driver_data *dd, int tag)
  *     or NULL if no command slots are available.
  */
 static struct scatterlist *mtip_hw_get_scatterlist(struct driver_data *dd,
-                                                  int *tag)
+                                                  int *tag, int unaligned)
 {
+       struct semaphore *sem = unaligned ? &dd->port->cmd_slot_unal :
+                                                       &dd->port->cmd_slot;
+
        /*
         * It is possible that, even with this semaphore, a thread
         * may think that no command slots are available. Therefore, we
         * need to make an attempt to get_slot().
         */
-       down(&dd->port->cmd_slot);
+       down(sem);
        *tag = get_slot(dd->port);
 
        if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag))) {
-               up(&dd->port->cmd_slot);
+               up(sem);
                return NULL;
        }
        if (unlikely(*tag < 0)) {
-               up(&dd->port->cmd_slot);
+               up(sem);
                return NULL;
        }
 
@@ -3010,6 +3026,11 @@ static inline void hba_setup(struct driver_data *dd)
                dd->mmio + HOST_HSORG);
 }
 
+static int mtip_device_unaligned_constrained(struct driver_data *dd)
+{
+       return (dd->pdev->device == P420M_DEVICE_ID ? 1 : 0);
+}
+
 /*
  * Detect the details of the product, and store anything needed
  * into the driver data structure.  This includes product type and
@@ -3232,8 +3253,15 @@ static int mtip_hw_init(struct driver_data *dd)
        for (i = 0; i < MTIP_MAX_SLOT_GROUPS; i++)
                dd->work[i].port = dd->port;
 
+       /* Enable unaligned IO constraints for some devices */
+       if (mtip_device_unaligned_constrained(dd))
+               dd->unal_qdepth = MTIP_MAX_UNALIGNED_SLOTS;
+       else
+               dd->unal_qdepth = 0;
+
        /* Counting semaphore to track command slot usage */
-       sema_init(&dd->port->cmd_slot, num_command_slots - 1);
+       sema_init(&dd->port->cmd_slot, num_command_slots - 1 - dd->unal_qdepth);
+       sema_init(&dd->port->cmd_slot_unal, dd->unal_qdepth);
 
        /* Spinlock to prevent concurrent issue */
        for (i = 0; i < MTIP_MAX_SLOT_GROUPS; i++)
@@ -3836,7 +3864,7 @@ static void mtip_make_request(struct request_queue *queue, struct bio *bio)
        struct scatterlist *sg;
        struct bio_vec *bvec;
        int nents = 0;
-       int tag = 0;
+       int tag = 0, unaligned = 0;
 
        if (unlikely(dd->dd_flag & MTIP_DDF_STOP_IO)) {
                if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT,
@@ -3872,7 +3900,15 @@ static void mtip_make_request(struct request_queue *queue, struct bio *bio)
                return;
        }
 
-       sg = mtip_hw_get_scatterlist(dd, &tag);
+       if (bio_data_dir(bio) == WRITE && bio_sectors(bio) <= 64 &&
+                                                       dd->unal_qdepth) {
+               if (bio->bi_sector % 8 != 0) /* Unaligned on 4k boundaries */
+                       unaligned = 1;
+               else if (bio_sectors(bio) % 8 != 0) /* Aligned but not 4k/8k */
+                       unaligned = 1;
+       }
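/*
 * A sketch of how small writes are classified by the check above:
 *   bi_sector 4096, 32 sectors  -> start and length both multiples
 *                                  of 8 (4 kB): stays aligned
 *   bi_sector 4103, 32 sectors  -> 4103 % 8 == 7: unaligned pool
 *   bi_sector 4096, 20 sectors  -> 20 % 8 == 4: unaligned pool
 *   bi_sector 4100, 200 sectors -> more than 64 sectors: never
 *                                  classified as unaligned
 */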
+
+       sg = mtip_hw_get_scatterlist(dd, &tag, unaligned);
        if (likely(sg != NULL)) {
                blk_queue_bounce(queue, &bio);
 
@@ -3880,7 +3916,7 @@ static void mtip_make_request(struct request_queue *queue, struct bio *bio)
                        dev_warn(&dd->pdev->dev,
                                "Maximum number of SGL entries exceeded\n");
                        bio_io_error(bio);
-                       mtip_hw_release_scatterlist(dd, tag);
+                       mtip_hw_release_scatterlist(dd, tag, unaligned);
                        return;
                }
 
@@ -3900,7 +3936,8 @@ static void mtip_make_request(struct request_queue *queue, struct bio *bio)
                                tag,
                                bio_endio,
                                bio,
-                               bio_data_dir(bio));
+                               bio_data_dir(bio),
+                               unaligned);
        } else
                bio_io_error(bio);
 }
@@ -4156,26 +4193,24 @@ static int mtip_block_remove(struct driver_data *dd)
  */
 static int mtip_block_shutdown(struct driver_data *dd)
 {
-       dev_info(&dd->pdev->dev,
-               "Shutting down %s ...\n", dd->disk->disk_name);
-
        /* Delete our gendisk structure, and cleanup the blk queue. */
        if (dd->disk) {
-               if (dd->disk->queue)
+               dev_info(&dd->pdev->dev,
+                       "Shutting down %s ...\n", dd->disk->disk_name);
+
+               if (dd->disk->queue) {
                        del_gendisk(dd->disk);
-               else
+                       blk_cleanup_queue(dd->queue);
+               } else
                        put_disk(dd->disk);
+               dd->disk  = NULL;
+               dd->queue = NULL;
        }
 
-
        spin_lock(&rssd_index_lock);
        ida_remove(&rssd_index_ida, dd->index);
        spin_unlock(&rssd_index_lock);
 
-       blk_cleanup_queue(dd->queue);
-       dd->disk  = NULL;
-       dd->queue = NULL;
-
        mtip_hw_shutdown(dd);
        return 0;
 }
index 8e8334c9dd0f08139c9061bfced2111fe661c208..3bb8a295fbe4d96d51b429812748ebe293ebb98d 100644 (file)
@@ -52,6 +52,9 @@
 #define MTIP_FTL_REBUILD_MAGIC         0xED51
 #define MTIP_FTL_REBUILD_TIMEOUT_MS    2400000
 
+/* unaligned IO handling */
+#define MTIP_MAX_UNALIGNED_SLOTS       8
+
 /* Macro to extract the tag bit number from a tag value. */
 #define MTIP_TAG_BIT(tag)      (tag & 0x1F)
 
@@ -333,6 +336,8 @@ struct mtip_cmd {
 
        int scatter_ents; /* Number of scatter list entries used */
 
+       int unaligned; /* command is unaligned on 4k boundary */
+
        struct scatterlist sg[MTIP_MAX_SG]; /* Scatter list entries */
 
        int retries; /* The number of retries left for this command. */
@@ -452,6 +457,10 @@ struct mtip_port {
         * command slots available.
         */
        struct semaphore cmd_slot;
+
+       /* Semaphore to control queue depth of unaligned IOs */
+       struct semaphore cmd_slot_unal;
+
        /* Spinlock for working around command-issue bug. */
        spinlock_t cmd_issue_lock[MTIP_MAX_SLOT_GROUPS];
 };
@@ -502,6 +511,8 @@ struct driver_data {
 
        int isr_binding;
 
+       int unal_qdepth; /* qdepth of unaligned IO queue */
+
        struct list_head online_list; /* linkage for online list */
 
        struct list_head remove_list; /* linkage for removing list */
index ba2b6b5e591084f6ee6951cafb6d941130d6a229..e76bdc074dbe5678e52a33463a743a8b55335555 100644 (file)
@@ -236,13 +236,12 @@ static int pcd_block_open(struct block_device *bdev, fmode_t mode)
        return ret;
 }
 
-static int pcd_block_release(struct gendisk *disk, fmode_t mode)
+static void pcd_block_release(struct gendisk *disk, fmode_t mode)
 {
        struct pcd_unit *cd = disk->private_data;
        mutex_lock(&pcd_mutex);
        cdrom_release(&cd->info, mode);
        mutex_unlock(&pcd_mutex);
-       return 0;
 }
 
 static int pcd_block_ioctl(struct block_device *bdev, fmode_t mode,
index 831e3ac156e625517df61fb3a1255973ea565ca7..19ad8f0c83efe6942eb2dbabc95a978a41f05cd9 100644 (file)
@@ -783,7 +783,7 @@ static int pd_ioctl(struct block_device *bdev, fmode_t mode,
        }
 }
 
-static int pd_release(struct gendisk *p, fmode_t mode)
+static void pd_release(struct gendisk *p, fmode_t mode)
 {
        struct pd_unit *disk = p->private_data;
 
@@ -791,8 +791,6 @@ static int pd_release(struct gendisk *p, fmode_t mode)
        if (!--disk->access && disk->removable)
                pd_special_command(disk, pd_door_unlock);
        mutex_unlock(&pd_mutex);
-
-       return 0;
 }
 
 static unsigned int pd_check_events(struct gendisk *p, unsigned int clearing)
index ec8f9ed6326eca34be8b837ce62aefecc1b4a25a..f5c86d523ba0c8fe3a6154737f010246d67540d0 100644 (file)
@@ -211,7 +211,7 @@ static int pf_ioctl(struct block_device *bdev, fmode_t mode,
                    unsigned int cmd, unsigned long arg);
 static int pf_getgeo(struct block_device *bdev, struct hd_geometry *geo);
 
-static int pf_release(struct gendisk *disk, fmode_t mode);
+static void pf_release(struct gendisk *disk, fmode_t mode);
 
 static int pf_detect(void);
 static void do_pf_read(void);
@@ -360,14 +360,15 @@ static int pf_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, u
        return 0;
 }
 
-static int pf_release(struct gendisk *disk, fmode_t mode)
+static void pf_release(struct gendisk *disk, fmode_t mode)
 {
        struct pf_unit *pf = disk->private_data;
 
        mutex_lock(&pf_mutex);
        if (pf->access <= 0) {
                mutex_unlock(&pf_mutex);
-               return -EINVAL;
+               WARN_ON(1);
+               return;
        }
 
        pf->access--;
@@ -376,8 +377,6 @@ static int pf_release(struct gendisk *disk, fmode_t mode)
                pf_lock(pf, 0);
 
        mutex_unlock(&pf_mutex);
-       return 0;
-
 }
 
 static unsigned int pf_check_events(struct gendisk *disk, unsigned int clearing)
index e0588c6dd86f824fdaa0bb99e8aadb5361e07312..3c08983e600a0a15e1380e9de1e6f50714fe3976 100644 (file)
@@ -901,7 +901,7 @@ static void pkt_iosched_process_queue(struct pktcdvd_device *pd)
                        pd->iosched.successive_reads += bio->bi_size >> 10;
                else {
                        pd->iosched.successive_reads = 0;
-                       pd->iosched.last_write = bio->bi_sector + bio_sectors(bio);
+                       pd->iosched.last_write = bio_end_sector(bio);
                }
                if (pd->iosched.successive_reads >= HI_SPEED_SWITCH) {
                        if (pd->read_speed == pd->write_speed) {
@@ -947,31 +947,6 @@ static int pkt_set_segment_merging(struct pktcdvd_device *pd, struct request_que
        }
 }
 
-/*
- * Copy CD_FRAMESIZE bytes from src_bio into a destination page
- */
-static void pkt_copy_bio_data(struct bio *src_bio, int seg, int offs, struct page *dst_page, int dst_offs)
-{
-       unsigned int copy_size = CD_FRAMESIZE;
-
-       while (copy_size > 0) {
-               struct bio_vec *src_bvl = bio_iovec_idx(src_bio, seg);
-               void *vfrom = kmap_atomic(src_bvl->bv_page) +
-                       src_bvl->bv_offset + offs;
-               void *vto = page_address(dst_page) + dst_offs;
-               int len = min_t(int, copy_size, src_bvl->bv_len - offs);
-
-               BUG_ON(len < 0);
-               memcpy(vto, vfrom, len);
-               kunmap_atomic(vfrom);
-
-               seg++;
-               offs = 0;
-               dst_offs += len;
-               copy_size -= len;
-       }
-}
-
 /*
  * Copy all data for this packet to pkt->pages[], so that
  * a) The number of required segments for the write bio is minimized, which
@@ -1181,16 +1156,15 @@ static int pkt_start_recovery(struct packet_data *pkt)
        new_sector = new_block * (CD_FRAMESIZE >> 9);
        pkt->sector = new_sector;
 
+       bio_reset(pkt->bio);
+       pkt->bio->bi_bdev = pd->bdev;
+       pkt->bio->bi_rw = REQ_WRITE;
        pkt->bio->bi_sector = new_sector;
-       pkt->bio->bi_next = NULL;
-       pkt->bio->bi_flags = 1 << BIO_UPTODATE;
-       pkt->bio->bi_idx = 0;
+       pkt->bio->bi_size = pkt->frames * CD_FRAMESIZE;
+       pkt->bio->bi_vcnt = pkt->frames;
 
-       BUG_ON(pkt->bio->bi_rw != REQ_WRITE);
-       BUG_ON(pkt->bio->bi_vcnt != pkt->frames);
-       BUG_ON(pkt->bio->bi_size != pkt->frames * CD_FRAMESIZE);
-       BUG_ON(pkt->bio->bi_end_io != pkt_end_io_packet_write);
-       BUG_ON(pkt->bio->bi_private != pkt);
+       pkt->bio->bi_end_io = pkt_end_io_packet_write;
+       pkt->bio->bi_private = pkt;
 
        drop_super(sb);
        return 1;
@@ -1325,55 +1299,35 @@ try_next_bio:
  */
 static void pkt_start_write(struct pktcdvd_device *pd, struct packet_data *pkt)
 {
-       struct bio *bio;
        int f;
-       int frames_write;
        struct bio_vec *bvec = pkt->w_bio->bi_io_vec;
 
+       bio_reset(pkt->w_bio);
+       pkt->w_bio->bi_sector = pkt->sector;
+       pkt->w_bio->bi_bdev = pd->bdev;
+       pkt->w_bio->bi_end_io = pkt_end_io_packet_write;
+       pkt->w_bio->bi_private = pkt;
+
+       /* XXX: locking? */
        for (f = 0; f < pkt->frames; f++) {
                bvec[f].bv_page = pkt->pages[(f * CD_FRAMESIZE) / PAGE_SIZE];
                bvec[f].bv_offset = (f * CD_FRAMESIZE) % PAGE_SIZE;
+               if (!bio_add_page(pkt->w_bio, bvec[f].bv_page, CD_FRAMESIZE, bvec[f].bv_offset))
+                       BUG();
        }
+       VPRINTK(DRIVER_NAME": vcnt=%d\n", pkt->w_bio->bi_vcnt);
 
        /*
         * Fill-in bvec with data from orig_bios.
         */
-       frames_write = 0;
        spin_lock(&pkt->lock);
-       bio_list_for_each(bio, &pkt->orig_bios) {
-               int segment = bio->bi_idx;
-               int src_offs = 0;
-               int first_frame = (bio->bi_sector - pkt->sector) / (CD_FRAMESIZE >> 9);
-               int num_frames = bio->bi_size / CD_FRAMESIZE;
-               BUG_ON(first_frame < 0);
-               BUG_ON(first_frame + num_frames > pkt->frames);
-               for (f = first_frame; f < first_frame + num_frames; f++) {
-                       struct bio_vec *src_bvl = bio_iovec_idx(bio, segment);
-
-                       while (src_offs >= src_bvl->bv_len) {
-                               src_offs -= src_bvl->bv_len;
-                               segment++;
-                               BUG_ON(segment >= bio->bi_vcnt);
-                               src_bvl = bio_iovec_idx(bio, segment);
-                       }
+       bio_copy_data(pkt->w_bio, pkt->orig_bios.head);
 
-                       if (src_bvl->bv_len - src_offs >= CD_FRAMESIZE) {
-                               bvec[f].bv_page = src_bvl->bv_page;
-                               bvec[f].bv_offset = src_bvl->bv_offset + src_offs;
-                       } else {
-                               pkt_copy_bio_data(bio, segment, src_offs,
-                                                 bvec[f].bv_page, bvec[f].bv_offset);
-                       }
-                       src_offs += CD_FRAMESIZE;
-                       frames_write++;
-               }
-       }
        pkt_set_state(pkt, PACKET_WRITE_WAIT_STATE);
        spin_unlock(&pkt->lock);
 
        VPRINTK("pkt_start_write: Writing %d frames for zone %llx\n",
-               frames_write, (unsigned long long)pkt->sector);
-       BUG_ON(frames_write != pkt->write_size);
+               pkt->write_size, (unsigned long long)pkt->sector);
 
        if (test_bit(PACKET_MERGE_SEGS, &pd->flags) || (pkt->write_size < pkt->frames)) {
                pkt_make_local_copy(pkt, bvec);
@@ -1383,16 +1337,6 @@ static void pkt_start_write(struct pktcdvd_device *pd, struct packet_data *pkt)
        }
 
        /* Start the write request */
-       bio_reset(pkt->w_bio);
-       pkt->w_bio->bi_sector = pkt->sector;
-       pkt->w_bio->bi_bdev = pd->bdev;
-       pkt->w_bio->bi_end_io = pkt_end_io_packet_write;
-       pkt->w_bio->bi_private = pkt;
-       for (f = 0; f < pkt->frames; f++)
-               if (!bio_add_page(pkt->w_bio, bvec[f].bv_page, CD_FRAMESIZE, bvec[f].bv_offset))
-                       BUG();
-       VPRINTK(DRIVER_NAME": vcnt=%d\n", pkt->w_bio->bi_vcnt);
-
        atomic_set(&pkt->io_wait, 1);
        pkt->w_bio->bi_rw = WRITE;
        pkt_queue_bio(pd, pkt->w_bio);
@@ -2376,10 +2320,9 @@ out:
        return ret;
 }
 
-static int pkt_close(struct gendisk *disk, fmode_t mode)
+static void pkt_close(struct gendisk *disk, fmode_t mode)
 {
        struct pktcdvd_device *pd = disk->private_data;
-       int ret = 0;
 
        mutex_lock(&pktcdvd_mutex);
        mutex_lock(&ctl_mutex);
@@ -2391,7 +2334,6 @@ static int pkt_close(struct gendisk *disk, fmode_t mode)
        }
        mutex_unlock(&ctl_mutex);
        mutex_unlock(&pktcdvd_mutex);
-       return ret;
 }
 
 
@@ -2433,7 +2375,7 @@ static void pkt_make_request(struct request_queue *q, struct bio *bio)
                cloned_bio->bi_bdev = pd->bdev;
                cloned_bio->bi_private = psd;
                cloned_bio->bi_end_io = pkt_end_io_read_cloned;
-               pd->stats.secs_r += bio->bi_size >> 9;
+               pd->stats.secs_r += bio_sectors(bio);
                pkt_queue_bio(pd, cloned_bio);
                return;
        }
@@ -2454,7 +2396,7 @@ static void pkt_make_request(struct request_queue *q, struct bio *bio)
        zone = ZONE(bio->bi_sector, pd);
        VPRINTK("pkt_make_request: start = %6llx stop = %6llx\n",
                (unsigned long long)bio->bi_sector,
-               (unsigned long long)(bio->bi_sector + bio_sectors(bio)));
+               (unsigned long long)bio_end_sector(bio));
 
        /* Check if we have to split the bio */
        {
@@ -2462,7 +2404,7 @@ static void pkt_make_request(struct request_queue *q, struct bio *bio)
                sector_t last_zone;
                int first_sectors;
 
-               last_zone = ZONE(bio->bi_sector + bio_sectors(bio) - 1, pd);
+               last_zone = ZONE(bio_end_sector(bio) - 1, pd);
                if (last_zone != zone) {
                        BUG_ON(last_zone != zone + pd->settings.size);
                        first_sectors = last_zone - bio->bi_sector;
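
Several of the pktcdvd hunks above replace the open-coded end-of-bio arithmetic with the block layer's bio_end_sector() helper. The substitution is mechanical, since the helper is essentially the expression being removed; as a reference-only sketch of its definition in this kernel generation:

/* Approximate definition from <linux/bio.h> in this era; the patch swaps the
 * open-coded form for this helper without changing behaviour. */
#define bio_end_sector(bio)	((bio)->bi_sector + bio_sectors(bio))
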
index c2ca1818f33583679ac3d96f8bbaa4f33949640a..ca63104136e0db46d0248aa290c355483989ec2e 100644 (file)
@@ -460,7 +460,7 @@ static int rbd_open(struct block_device *bdev, fmode_t mode)
        return 0;
 }
 
-static int rbd_release(struct gendisk *disk, fmode_t mode)
+static void rbd_release(struct gendisk *disk, fmode_t mode)
 {
        struct rbd_device *rbd_dev = disk->private_data;
        unsigned long open_count_before;
@@ -473,8 +473,6 @@ static int rbd_release(struct gendisk *disk, fmode_t mode)
        mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
        put_device(&rbd_dev->dev);
        mutex_unlock(&ctl_mutex);
-
-       return 0;
 }
 
 static const struct block_device_operations rbd_bd_ops = {
@@ -1145,7 +1143,7 @@ static struct bio *bio_clone_range(struct bio *bio_src,
        /* Find first affected segment... */
 
        resid = offset;
-       __bio_for_each_segment(bv, bio_src, idx, 0) {
+       bio_for_each_segment(bv, bio_src, idx) {
                if (resid < bv->bv_len)
                        break;
                resid -= bv->bv_len;
index 8766a2257091a46e9bb1ae39145900f015ae54ae..2f445b7a174e2c1dff1f9a2d1fb7438e9a242ff9 100644 (file)
@@ -673,7 +673,7 @@ static int floppy_unlocked_open(struct block_device *bdev, fmode_t mode)
        return ret;
 }
 
-static int floppy_release(struct gendisk *disk, fmode_t mode)
+static void floppy_release(struct gendisk *disk, fmode_t mode)
 {
        struct floppy_state *fs = disk->private_data;
        struct swim __iomem *base = fs->swd->base;
@@ -687,8 +687,6 @@ static int floppy_release(struct gendisk *disk, fmode_t mode)
        if (fs->ref_count == 0)
                swim_motor(base, OFF);
        mutex_unlock(&swim_mutex);
-
-       return 0;
 }
 
 static int floppy_ioctl(struct block_device *bdev, fmode_t mode,
index 758f2ac878cfe5bfc8f5c891f5f0a48cc4b2e3fd..20e061c3e02329c0662b4ce8fac0ad2b82d9c8ce 100644 (file)
@@ -251,7 +251,7 @@ static int fd_eject(struct floppy_state *fs);
 static int floppy_ioctl(struct block_device *bdev, fmode_t mode,
                        unsigned int cmd, unsigned long param);
 static int floppy_open(struct block_device *bdev, fmode_t mode);
-static int floppy_release(struct gendisk *disk, fmode_t mode);
+static void floppy_release(struct gendisk *disk, fmode_t mode);
 static unsigned int floppy_check_events(struct gendisk *disk,
                                        unsigned int clearing);
 static int floppy_revalidate(struct gendisk *disk);
@@ -1017,7 +1017,7 @@ static int floppy_unlocked_open(struct block_device *bdev, fmode_t mode)
        return ret;
 }
 
-static int floppy_release(struct gendisk *disk, fmode_t mode)
+static void floppy_release(struct gendisk *disk, fmode_t mode)
 {
        struct floppy_state *fs = disk->private_data;
        struct swim3 __iomem *sw = fs->swim3;
@@ -1029,7 +1029,6 @@ static int floppy_release(struct gendisk *disk, fmode_t mode)
                swim3_select(fs, RELAX);
        }
        mutex_unlock(&swim3_mutex);
-       return 0;
 }
 
 static unsigned int floppy_check_events(struct gendisk *disk,
index a894f88762d8d3a1f72315e805f9494cdad8294d..d89ef86220f4a55a247083d36d5f8dd3ce74e0d5 100644 (file)
@@ -1617,7 +1617,7 @@ out:
        return err;
 }
 
-static int blkif_release(struct gendisk *disk, fmode_t mode)
+static void blkif_release(struct gendisk *disk, fmode_t mode)
 {
        struct blkfront_info *info = disk->private_data;
        struct block_device *bdev;
@@ -1658,7 +1658,6 @@ static int blkif_release(struct gendisk *disk, fmode_t mode)
 out:
        bdput(bdev);
        mutex_unlock(&blkfront_mutex);
-       return 0;
 }
 
 static const struct block_device_operations xlvbd_block_fops =
index 1f38643173caa3968f6f1b53f778a2a4f24411fc..f8ef15f37c5ec67eb6a35151b756c7496a1a666d 100644 (file)
@@ -915,7 +915,7 @@ static int ace_open(struct block_device *bdev, fmode_t mode)
        return 0;
 }
 
-static int ace_release(struct gendisk *disk, fmode_t mode)
+static void ace_release(struct gendisk *disk, fmode_t mode)
 {
        struct ace_device *ace = disk->private_data;
        unsigned long flags;
@@ -932,7 +932,6 @@ static int ace_release(struct gendisk *disk, fmode_t mode)
        }
        spin_unlock_irqrestore(&ace->lock, flags);
        mutex_unlock(&xsysace_mutex);
-       return 0;
 }
 
 static int ace_getgeo(struct block_device *bdev, struct hd_geometry *geo)
index a22e3f895947e0193d7d1f0d8377ac715b142100..5a95baf4b104e4c2dc559534867effa53e6327a2 100644 (file)
@@ -309,20 +309,18 @@ err_out:
     return rc;
 }
 
-static int
+static void
 z2_release(struct gendisk *disk, fmode_t mode)
 {
     mutex_lock(&z2ram_mutex);
     if ( current_device == -1 ) {
        mutex_unlock(&z2ram_mutex);
-       return 0;
+       return;
     }
     mutex_unlock(&z2ram_mutex);
     /*
      * FIXME: unmap memory
      */
-
-    return 0;
 }
 
 static const struct block_device_operations z2_fops =
index d59cdcb8fe399b43c714370c1a5a3cb68cbed91d..4afcb65cc62397466aee5f8272051e197a9c9fb1 100644 (file)
@@ -503,12 +503,11 @@ static int gdrom_bdops_open(struct block_device *bdev, fmode_t mode)
        return ret;
 }
 
-static int gdrom_bdops_release(struct gendisk *disk, fmode_t mode)
+static void gdrom_bdops_release(struct gendisk *disk, fmode_t mode)
 {
        mutex_lock(&gdrom_mutex);
        cdrom_release(gd.cd_info, mode);
        mutex_unlock(&gdrom_mutex);
-       return 0;
 }
 
 static unsigned int gdrom_bdops_check_events(struct gendisk *disk,
index 2c644afbcdd4561d2fc7810aafd2ef9502ecf0e6..1ccbe9482faa5dbc3930478a17f493781849f161 100644 (file)
@@ -28,6 +28,7 @@
 #include <linux/pfn.h>
 #include <linux/export.h>
 #include <linux/io.h>
+#include <linux/aio.h>
 
 #include <asm/uaccess.h>
 
@@ -627,6 +628,18 @@ static ssize_t write_null(struct file *file, const char __user *buf,
        return count;
 }
 
+static ssize_t aio_read_null(struct kiocb *iocb, const struct iovec *iov,
+                            unsigned long nr_segs, loff_t pos)
+{
+       return 0;
+}
+
+static ssize_t aio_write_null(struct kiocb *iocb, const struct iovec *iov,
+                             unsigned long nr_segs, loff_t pos)
+{
+       return iov_length(iov, nr_segs);
+}
+
 static int pipe_to_null(struct pipe_inode_info *info, struct pipe_buffer *buf,
                        struct splice_desc *sd)
 {
@@ -670,6 +683,24 @@ static ssize_t read_zero(struct file *file, char __user *buf,
        return written ? written : -EFAULT;
 }
 
+static ssize_t aio_read_zero(struct kiocb *iocb, const struct iovec *iov,
+                            unsigned long nr_segs, loff_t pos)
+{
+       size_t written = 0;
+       unsigned long i;
+       ssize_t ret;
+
+       for (i = 0; i < nr_segs; i++) {
+               ret = read_zero(iocb->ki_filp, iov[i].iov_base, iov[i].iov_len,
+                               &pos);
+               if (ret < 0)
+                       break;
+               written += ret;
+       }
+
+       return written ? written : -EFAULT;
+}
+
 static int mmap_zero(struct file *file, struct vm_area_struct *vma)
 {
 #ifndef CONFIG_MMU
@@ -738,6 +769,7 @@ static int open_port(struct inode *inode, struct file *filp)
 #define full_lseek      null_lseek
 #define write_zero     write_null
 #define read_full       read_zero
+#define aio_write_zero aio_write_null
 #define open_mem       open_port
 #define open_kmem      open_mem
 #define open_oldmem    open_mem
@@ -766,6 +798,8 @@ static const struct file_operations null_fops = {
        .llseek         = null_lseek,
        .read           = read_null,
        .write          = write_null,
+       .aio_read       = aio_read_null,
+       .aio_write      = aio_write_null,
        .splice_write   = splice_write_null,
 };
 
@@ -782,6 +816,8 @@ static const struct file_operations zero_fops = {
        .llseek         = zero_lseek,
        .read           = read_zero,
        .write          = write_zero,
+       .aio_read       = aio_read_zero,
+       .aio_write      = aio_write_zero,
        .mmap           = mmap_zero,
 };
 
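
The /dev/null and /dev/zero changes above wire the character devices into the in-kernel AIO path (note the new <linux/aio.h> include). As a hedged userspace illustration, assuming libaio is installed, an asynchronous read submitted against /dev/zero would now be completed by aio_read_zero():

#include <libaio.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	io_context_t ctx = 0;
	struct iocb cb, *cbs[1] = { &cb };
	struct io_event ev;
	char buf[4096];
	int fd = open("/dev/zero", O_RDONLY);

	if (fd < 0 || io_setup(1, &ctx))
		return 1;

	/* Queue one 4 KiB read at offset 0 and wait for its completion. */
	io_prep_pread(&cb, fd, buf, sizeof(buf), 0);
	if (io_submit(ctx, 1, cbs) != 1)
		return 1;
	io_getevents(ctx, 1, 1, &ev, NULL);

	printf("aio read from /dev/zero returned %ld bytes\n", (long)ev.res);

	io_destroy(ctx);
	close(fd);
	return 0;
}
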
index aeaea32bcfdacd5fed2eb7e53536169348791cc8..e9924898043adf2a437b8f2eb3b8f15329ac755f 100644 (file)
@@ -63,8 +63,6 @@ config INTEL_IOATDMA
        depends on PCI && X86
        select DMA_ENGINE
        select DCA
-       select ASYNC_TX_DISABLE_PQ_VAL_DMA
-       select ASYNC_TX_DISABLE_XOR_VAL_DMA
        help
          Enable support for the Intel(R) I/OAT DMA engine present
          in recent Intel Xeon chipsets.
@@ -174,15 +172,7 @@ config TEGRA20_APB_DMA
          This DMA controller transfers data from memory to peripheral fifo
          or vice versa. It does not support memory to memory data transfer.
 
-
-
-config SH_DMAE
-       tristate "Renesas SuperH DMAC support"
-       depends on (SUPERH && SH_DMA) || (ARM && ARCH_SHMOBILE)
-       depends on !SH_DMA_API
-       select DMA_ENGINE
-       help
-         Enable support for the Renesas SuperH DMA controllers.
+source "drivers/dma/sh/Kconfig"
 
 config COH901318
        bool "ST-Ericsson COH901318 DMA support"
@@ -328,6 +318,10 @@ config DMA_ENGINE
 config DMA_VIRTUAL_CHANNELS
        tristate
 
+config DMA_ACPI
+       def_bool y
+       depends on ACPI
+
 config DMA_OF
        def_bool y
        depends on OF
index 488e3ff85b522840ce1ba506916c534ff3116455..a2b0df591f958654549c6a15c60a08efb4691678 100644 (file)
@@ -3,6 +3,7 @@ ccflags-$(CONFIG_DMADEVICES_VDEBUG) += -DVERBOSE_DEBUG
 
 obj-$(CONFIG_DMA_ENGINE) += dmaengine.o
 obj-$(CONFIG_DMA_VIRTUAL_CHANNELS) += virt-dma.o
+obj-$(CONFIG_DMA_ACPI) += acpi-dma.o
 obj-$(CONFIG_DMA_OF) += of-dma.o
 
 obj-$(CONFIG_NET_DMA) += iovlock.o
@@ -18,7 +19,7 @@ obj-$(CONFIG_DW_DMAC) += dw_dmac.o
 obj-$(CONFIG_AT_HDMAC) += at_hdmac.o
 obj-$(CONFIG_MX3_IPU) += ipu/
 obj-$(CONFIG_TXX9_DMAC) += txx9dmac.o
-obj-$(CONFIG_SH_DMAE) += sh/
+obj-$(CONFIG_SH_DMAE_BASE) += sh/
 obj-$(CONFIG_COH901318) += coh901318.o coh901318_lli.o
 obj-$(CONFIG_AMCC_PPC440SPE_ADMA) += ppc4xx/
 obj-$(CONFIG_IMX_SDMA) += imx-sdma.o
diff --git a/drivers/dma/acpi-dma.c b/drivers/dma/acpi-dma.c
new file mode 100644 (file)
index 0000000..ba6fc62
--- /dev/null
@@ -0,0 +1,279 @@
+/*
+ * ACPI helpers for DMA request / controller
+ *
+ * Based on of-dma.c
+ *
+ * Copyright (C) 2013, Intel Corporation
+ * Author: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/device.h>
+#include <linux/module.h>
+#include <linux/list.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <linux/acpi.h>
+#include <linux/acpi_dma.h>
+
+static LIST_HEAD(acpi_dma_list);
+static DEFINE_MUTEX(acpi_dma_lock);
+
+/**
+ * acpi_dma_controller_register - Register a DMA controller to ACPI DMA helpers
+ * @dev:               struct device of DMA controller
+ * @acpi_dma_xlate:    translation function which converts a dma specifier
+ *                     into a dma_chan structure
+ * @data:              pointer to controller specific data to be used by
+ *                     translation function
+ *
+ * Returns 0 on success or appropriate errno value on error.
+ *
+ * Allocated memory should be freed with a matching acpi_dma_controller_free()
+ * call.
+ */
+int acpi_dma_controller_register(struct device *dev,
+               struct dma_chan *(*acpi_dma_xlate)
+               (struct acpi_dma_spec *, struct acpi_dma *),
+               void *data)
+{
+       struct acpi_device *adev;
+       struct acpi_dma *adma;
+
+       if (!dev || !acpi_dma_xlate)
+               return -EINVAL;
+
+       /* Check if the device was enumerated by ACPI */
+       if (!ACPI_HANDLE(dev))
+               return -EINVAL;
+
+       if (acpi_bus_get_device(ACPI_HANDLE(dev), &adev))
+               return -EINVAL;
+
+       adma = kzalloc(sizeof(*adma), GFP_KERNEL);
+       if (!adma)
+               return -ENOMEM;
+
+       adma->dev = dev;
+       adma->acpi_dma_xlate = acpi_dma_xlate;
+       adma->data = data;
+
+       /* Now queue acpi_dma controller structure in list */
+       mutex_lock(&acpi_dma_lock);
+       list_add_tail(&adma->dma_controllers, &acpi_dma_list);
+       mutex_unlock(&acpi_dma_lock);
+
+       return 0;
+}
+EXPORT_SYMBOL_GPL(acpi_dma_controller_register);
+
+/**
+ * acpi_dma_controller_free - Remove a DMA controller from ACPI DMA helpers list
+ * @dev:       struct device of DMA controller
+ *
+ * Memory allocated by acpi_dma_controller_register() is freed here.
+ */
+int acpi_dma_controller_free(struct device *dev)
+{
+       struct acpi_dma *adma;
+
+       if (!dev)
+               return -EINVAL;
+
+       mutex_lock(&acpi_dma_lock);
+
+       list_for_each_entry(adma, &acpi_dma_list, dma_controllers)
+               if (adma->dev == dev) {
+                       list_del(&adma->dma_controllers);
+                       mutex_unlock(&acpi_dma_lock);
+                       kfree(adma);
+                       return 0;
+               }
+
+       mutex_unlock(&acpi_dma_lock);
+       return -ENODEV;
+}
+EXPORT_SYMBOL_GPL(acpi_dma_controller_free);
+
+static void devm_acpi_dma_release(struct device *dev, void *res)
+{
+       acpi_dma_controller_free(dev);
+}
+
+/**
+ * devm_acpi_dma_controller_register - resource managed acpi_dma_controller_register()
+ * @dev:               device that is registering this DMA controller
+ * @acpi_dma_xlate:    translation function
+ * @data:              pointer to controller specific data
+ *
+ * Managed acpi_dma_controller_register(). A DMA controller registered by this
+ * function is automatically freed on driver detach. See
+ * acpi_dma_controller_register() for more information.
+ */
+int devm_acpi_dma_controller_register(struct device *dev,
+               struct dma_chan *(*acpi_dma_xlate)
+               (struct acpi_dma_spec *, struct acpi_dma *),
+               void *data)
+{
+       void *res;
+       int ret;
+
+       res = devres_alloc(devm_acpi_dma_release, 0, GFP_KERNEL);
+       if (!res)
+               return -ENOMEM;
+
+       ret = acpi_dma_controller_register(dev, acpi_dma_xlate, data);
+       if (ret) {
+               devres_free(res);
+               return ret;
+       }
+       devres_add(dev, res);
+       return 0;
+}
+EXPORT_SYMBOL_GPL(devm_acpi_dma_controller_register);
+
+/**
+ * devm_acpi_dma_controller_free - resource managed acpi_dma_controller_free()
+ *
+ * Unregister a DMA controller registered with
+ * devm_acpi_dma_controller_register(). Normally this function will not need to
+ * be called and the resource management code will ensure that the resource is
+ * freed.
+ */
+void devm_acpi_dma_controller_free(struct device *dev)
+{
+       WARN_ON(devres_destroy(dev, devm_acpi_dma_release, NULL, NULL));
+}
+EXPORT_SYMBOL_GPL(devm_acpi_dma_controller_free);
+
+struct acpi_dma_parser_data {
+       struct acpi_dma_spec dma_spec;
+       size_t index;
+       size_t n;
+};
+
+/**
+ * acpi_dma_parse_fixed_dma - Parse FixedDMA ACPI resources to a DMA specifier
+ * @res:       struct acpi_resource to get FixedDMA resources from
+ * @data:      pointer to a helper struct acpi_dma_parser_data
+ */
+static int acpi_dma_parse_fixed_dma(struct acpi_resource *res, void *data)
+{
+       struct acpi_dma_parser_data *pdata = data;
+
+       if (res->type == ACPI_RESOURCE_TYPE_FIXED_DMA) {
+               struct acpi_resource_fixed_dma *dma = &res->data.fixed_dma;
+
+               if (pdata->n++ == pdata->index) {
+                       pdata->dma_spec.chan_id = dma->channels;
+                       pdata->dma_spec.slave_id = dma->request_lines;
+               }
+       }
+
+       /* Tell the ACPI core to skip this resource */
+       return 1;
+}
+
+/**
+ * acpi_dma_request_slave_chan_by_index - Get the DMA slave channel
+ * @dev:       struct device to get DMA request from
+ * @index:     index of FixedDMA descriptor for @dev
+ *
+ * Returns pointer to appropriate dma channel on success or NULL on error.
+ */
+struct dma_chan *acpi_dma_request_slave_chan_by_index(struct device *dev,
+               size_t index)
+{
+       struct acpi_dma_parser_data pdata;
+       struct acpi_dma_spec *dma_spec = &pdata.dma_spec;
+       struct list_head resource_list;
+       struct acpi_device *adev;
+       struct acpi_dma *adma;
+       struct dma_chan *chan = NULL;
+
+       /* Check if the device was enumerated by ACPI */
+       if (!dev || !ACPI_HANDLE(dev))
+               return NULL;
+
+       if (acpi_bus_get_device(ACPI_HANDLE(dev), &adev))
+               return NULL;
+
+       memset(&pdata, 0, sizeof(pdata));
+       pdata.index = index;
+
+       /* Initial values for the request line and channel */
+       dma_spec->chan_id = -1;
+       dma_spec->slave_id = -1;
+
+       INIT_LIST_HEAD(&resource_list);
+       acpi_dev_get_resources(adev, &resource_list,
+                       acpi_dma_parse_fixed_dma, &pdata);
+       acpi_dev_free_resource_list(&resource_list);
+
+       if (dma_spec->slave_id < 0 || dma_spec->chan_id < 0)
+               return NULL;
+
+       mutex_lock(&acpi_dma_lock);
+
+       list_for_each_entry(adma, &acpi_dma_list, dma_controllers) {
+               dma_spec->dev = adma->dev;
+               chan = adma->acpi_dma_xlate(dma_spec, adma);
+               if (chan)
+                       break;
+       }
+
+       mutex_unlock(&acpi_dma_lock);
+       return chan;
+}
+EXPORT_SYMBOL_GPL(acpi_dma_request_slave_chan_by_index);
+
+/**
+ * acpi_dma_request_slave_chan_by_name - Get the DMA slave channel
+ * @dev:       struct device to get DMA request from
+ * @name:      represents corresponding FixedDMA descriptor for @dev
+ *
+ * In order to support both Device Tree and ACPI in a single driver we
+ * translate the names "tx" and "rx" here based on the most common case where
+ * the first FixedDMA descriptor is TX and second is RX.
+ *
+ * Returns pointer to appropriate dma channel on success or NULL on error.
+ */
+struct dma_chan *acpi_dma_request_slave_chan_by_name(struct device *dev,
+               const char *name)
+{
+       size_t index;
+
+       if (!strcmp(name, "tx"))
+               index = 0;
+       else if (!strcmp(name, "rx"))
+               index = 1;
+       else
+               return NULL;
+
+       return acpi_dma_request_slave_chan_by_index(dev, index);
+}
+EXPORT_SYMBOL_GPL(acpi_dma_request_slave_chan_by_name);
+
+/**
+ * acpi_dma_simple_xlate - Simple ACPI DMA engine translation helper
+ * @dma_spec: pointer to ACPI DMA specifier
+ * @adma: pointer to ACPI DMA controller data
+ *
+ * A simple translation function for ACPI based devices. Passes the DMA
+ * specifier to the filter function provided by the DMA controller driver. Returns
+ * pointer to the channel if found or %NULL otherwise.
+ */
+struct dma_chan *acpi_dma_simple_xlate(struct acpi_dma_spec *dma_spec,
+               struct acpi_dma *adma)
+{
+       struct acpi_dma_filter_info *info = adma->data;
+
+       if (!info || !info->filter_fn)
+               return NULL;
+
+       return dma_request_channel(info->dma_cap, info->filter_fn, dma_spec);
+}
+EXPORT_SYMBOL_GPL(acpi_dma_simple_xlate);
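
The new acpi-dma.c helpers above are meant to be paired with a DMA controller driver that owns the channels. A minimal registration sketch, assuming a controller with its own dma_filter_fn; the dw_* names below are placeholders, and struct acpi_dma_filter_info (fields dma_cap and filter_fn, as used by acpi_dma_simple_xlate() above) comes from the new <linux/acpi_dma.h> header:

#include <linux/acpi_dma.h>
#include <linux/dmaengine.h>
#include <linux/platform_device.h>

static bool dw_dma_filter(struct dma_chan *chan, void *param);	/* placeholder */

static struct acpi_dma_filter_info dw_dma_acpi_info;

static int dw_dma_acpi_register(struct platform_device *pdev)
{
	/* Advertise slave capability and hand the helpers our filter function. */
	dma_cap_zero(dw_dma_acpi_info.dma_cap);
	dma_cap_set(DMA_SLAVE, dw_dma_acpi_info.dma_cap);
	dw_dma_acpi_info.filter_fn = dw_dma_filter;

	/* The devm_ variant is unregistered automatically on driver detach. */
	return devm_acpi_dma_controller_register(&pdev->dev,
						 acpi_dma_simple_xlate,
						 &dw_dma_acpi_info);
}
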
index 88cfc61329d20fb137a46ff852569bf8b567c60a..e923cda930f98a09c90fa73b868cfeb3b619b30d 100644 (file)
@@ -24,6 +24,7 @@
 #include <linux/slab.h>
 #include <linux/of.h>
 #include <linux/of_device.h>
+#include <linux/of_dma.h>
 
 #include "at_hdmac_regs.h"
 #include "dmaengine.h"
@@ -677,7 +678,7 @@ atc_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl,
                ctrlb |=  ATC_DST_ADDR_MODE_FIXED
                        | ATC_SRC_ADDR_MODE_INCR
                        | ATC_FC_MEM2PER
-                       | ATC_SIF(AT_DMA_MEM_IF) | ATC_DIF(AT_DMA_PER_IF);
+                       | ATC_SIF(atchan->mem_if) | ATC_DIF(atchan->per_if);
                reg = sconfig->dst_addr;
                for_each_sg(sgl, sg, sg_len, i) {
                        struct at_desc  *desc;
@@ -716,7 +717,7 @@ atc_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl,
                ctrlb |=  ATC_DST_ADDR_MODE_INCR
                        | ATC_SRC_ADDR_MODE_FIXED
                        | ATC_FC_PER2MEM
-                       | ATC_SIF(AT_DMA_PER_IF) | ATC_DIF(AT_DMA_MEM_IF);
+                       | ATC_SIF(atchan->per_if) | ATC_DIF(atchan->mem_if);
 
                reg = sconfig->src_addr;
                for_each_sg(sgl, sg, sg_len, i) {
@@ -822,8 +823,8 @@ atc_dma_cyclic_fill_desc(struct dma_chan *chan, struct at_desc *desc,
                desc->lli.ctrlb = ATC_DST_ADDR_MODE_FIXED
                                | ATC_SRC_ADDR_MODE_INCR
                                | ATC_FC_MEM2PER
-                               | ATC_SIF(AT_DMA_MEM_IF)
-                               | ATC_DIF(AT_DMA_PER_IF);
+                               | ATC_SIF(atchan->mem_if)
+                               | ATC_DIF(atchan->per_if);
                break;
 
        case DMA_DEV_TO_MEM:
@@ -833,8 +834,8 @@ atc_dma_cyclic_fill_desc(struct dma_chan *chan, struct at_desc *desc,
                desc->lli.ctrlb = ATC_DST_ADDR_MODE_INCR
                                | ATC_SRC_ADDR_MODE_FIXED
                                | ATC_FC_PER2MEM
-                               | ATC_SIF(AT_DMA_PER_IF)
-                               | ATC_DIF(AT_DMA_MEM_IF);
+                               | ATC_SIF(atchan->per_if)
+                               | ATC_DIF(atchan->mem_if);
                break;
 
        default:
@@ -1188,6 +1189,67 @@ static void atc_free_chan_resources(struct dma_chan *chan)
        dev_vdbg(chan2dev(chan), "free_chan_resources: done\n");
 }
 
+#ifdef CONFIG_OF
+static bool at_dma_filter(struct dma_chan *chan, void *slave)
+{
+       struct at_dma_slave *atslave = slave;
+
+       if (atslave->dma_dev == chan->device->dev) {
+               chan->private = atslave;
+               return true;
+       } else {
+               return false;
+       }
+}
+
+static struct dma_chan *at_dma_xlate(struct of_phandle_args *dma_spec,
+                                    struct of_dma *of_dma)
+{
+       struct dma_chan *chan;
+       struct at_dma_chan *atchan;
+       struct at_dma_slave *atslave;
+       dma_cap_mask_t mask;
+       unsigned int per_id;
+       struct platform_device *dmac_pdev;
+
+       if (dma_spec->args_count != 2)
+               return NULL;
+
+       dmac_pdev = of_find_device_by_node(dma_spec->np);
+
+       dma_cap_zero(mask);
+       dma_cap_set(DMA_SLAVE, mask);
+
+       atslave = devm_kzalloc(&dmac_pdev->dev, sizeof(*atslave), GFP_KERNEL);
+       if (!atslave)
+               return NULL;
+       /*
+        * We can fill both SRC_PER and DST_PER; one of these fields will be
+        * ignored depending on the DMA transfer direction.
+        */
+       per_id = dma_spec->args[1];
+       atslave->cfg = ATC_FIFOCFG_HALFFIFO | ATC_DST_H2SEL_HW
+                     | ATC_SRC_H2SEL_HW | ATC_DST_PER(per_id)
+                     | ATC_SRC_PER(per_id);
+       atslave->dma_dev = &dmac_pdev->dev;
+
+       chan = dma_request_channel(mask, at_dma_filter, atslave);
+       if (!chan)
+               return NULL;
+
+       atchan = to_at_dma_chan(chan);
+       atchan->per_if = dma_spec->args[0] & 0xff;
+       atchan->mem_if = (dma_spec->args[0] >> 16) & 0xff;
+
+       return chan;
+}
+#else
+static struct dma_chan *at_dma_xlate(struct of_phandle_args *dma_spec,
+                                    struct of_dma *of_dma)
+{
+       return NULL;
+}
+#endif
 
 /*--  Module Management  -----------------------------------------------*/
 
@@ -1342,6 +1404,8 @@ static int __init at_dma_probe(struct platform_device *pdev)
        for (i = 0; i < plat_dat->nr_channels; i++) {
                struct at_dma_chan      *atchan = &atdma->chan[i];
 
+               atchan->mem_if = AT_DMA_MEM_IF;
+               atchan->per_if = AT_DMA_PER_IF;
                atchan->chan_common.device = &atdma->dma_common;
                dma_cookie_init(&atchan->chan_common);
                list_add_tail(&atchan->chan_common.device_node,
@@ -1388,8 +1452,25 @@ static int __init at_dma_probe(struct platform_device *pdev)
 
        dma_async_device_register(&atdma->dma_common);
 
+       /*
+        * Do not return an error if the dmac node is not present in order to
+        * not break the existing way of requesting channel with
+        * dma_request_channel().
+        */
+       if (pdev->dev.of_node) {
+               err = of_dma_controller_register(pdev->dev.of_node,
+                                                at_dma_xlate, atdma);
+               if (err) {
+                       dev_err(&pdev->dev, "could not register of_dma_controller\n");
+                       goto err_of_dma_controller_register;
+               }
+       }
+
        return 0;
 
+err_of_dma_controller_register:
+       dma_async_device_unregister(&atdma->dma_common);
+       dma_pool_destroy(atdma->dma_desc_pool);
 err_pool_create:
        platform_set_drvdata(pdev, NULL);
        free_irq(platform_get_irq(pdev, 0), atdma);
@@ -1406,7 +1487,7 @@ err_kfree:
        return err;
 }
 
-static int __exit at_dma_remove(struct platform_device *pdev)
+static int at_dma_remove(struct platform_device *pdev)
 {
        struct at_dma           *atdma = platform_get_drvdata(pdev);
        struct dma_chan         *chan, *_chan;
@@ -1564,7 +1645,7 @@ static const struct dev_pm_ops at_dma_dev_pm_ops = {
 };
 
 static struct platform_driver at_dma_driver = {
-       .remove         = __exit_p(at_dma_remove),
+       .remove         = at_dma_remove,
        .shutdown       = at_dma_shutdown,
        .id_table       = atdma_devtypes,
        .driver = {
index 0eb3c1388667dc4de3a6d2e5284ba493b3d388bd..c604d26fd4d38cf48b437157a47450c49a076719 100644 (file)
@@ -220,6 +220,8 @@ enum atc_status {
  * @device: parent device
  * @ch_regs: memory mapped register base
  * @mask: channel index in a mask
+ * @per_if: peripheral interface
+ * @mem_if: memory interface
  * @status: transmit status information from irq/prep* functions
  *                to tasklet (use atomic operations)
  * @tasklet: bottom half to finish transaction work
@@ -238,6 +240,8 @@ struct at_dma_chan {
        struct at_dma           *device;
        void __iomem            *ch_regs;
        u8                      mask;
+       u8                      per_if;
+       u8                      mem_if;
        unsigned long           status;
        struct tasklet_struct   tasklet;
        u32                     save_cfg;
index 797940e532ff71dbf9a53000f4bfb65017b79433..3b23061cdb41bd0e963a5160bb4fbd7edf62de2d 100644 (file)
@@ -2748,7 +2748,7 @@ static int __init coh901318_probe(struct platform_device *pdev)
        return err;
 }
 
-static int __exit coh901318_remove(struct platform_device *pdev)
+static int coh901318_remove(struct platform_device *pdev)
 {
        struct coh901318_base *base = platform_get_drvdata(pdev);
 
@@ -2760,7 +2760,7 @@ static int __exit coh901318_remove(struct platform_device *pdev)
 
 
 static struct platform_driver coh901318_driver = {
-       .remove = __exit_p(coh901318_remove),
+       .remove = coh901318_remove,
        .driver = {
                .name   = "coh901318",
        },
index b2728d6ba2fdea97774d904bcd04f58d12187772..93f7992bee5c1c933e49b7f93ee41c81265698c8 100644 (file)
@@ -62,6 +62,8 @@
 #include <linux/rculist.h>
 #include <linux/idr.h>
 #include <linux/slab.h>
+#include <linux/acpi.h>
+#include <linux/acpi_dma.h>
 #include <linux/of_dma.h>
 
 static DEFINE_MUTEX(dma_list_mutex);
@@ -174,7 +176,8 @@ static struct class dma_devclass = {
 #define dma_device_satisfies_mask(device, mask) \
        __dma_device_satisfies_mask((device), &(mask))
 static int
-__dma_device_satisfies_mask(struct dma_device *device, dma_cap_mask_t *want)
+__dma_device_satisfies_mask(struct dma_device *device,
+                           const dma_cap_mask_t *want)
 {
        dma_cap_mask_t has;
 
@@ -463,7 +466,8 @@ static void dma_channel_rebalance(void)
                }
 }
 
-static struct dma_chan *private_candidate(dma_cap_mask_t *mask, struct dma_device *dev,
+static struct dma_chan *private_candidate(const dma_cap_mask_t *mask,
+                                         struct dma_device *dev,
                                          dma_filter_fn fn, void *fn_param)
 {
        struct dma_chan *chan;
@@ -505,7 +509,8 @@ static struct dma_chan *private_candidate(dma_cap_mask_t *mask, struct dma_devic
  * @fn: optional callback to disposition available channels
  * @fn_param: opaque parameter to pass to dma_filter_fn
  */
-struct dma_chan *__dma_request_channel(dma_cap_mask_t *mask, dma_filter_fn fn, void *fn_param)
+struct dma_chan *__dma_request_channel(const dma_cap_mask_t *mask,
+                                      dma_filter_fn fn, void *fn_param)
 {
        struct dma_device *device, *_d;
        struct dma_chan *chan = NULL;
@@ -555,12 +560,16 @@ EXPORT_SYMBOL_GPL(__dma_request_channel);
  * @dev:       pointer to client device structure
  * @name:      slave channel name
  */
-struct dma_chan *dma_request_slave_channel(struct device *dev, char *name)
+struct dma_chan *dma_request_slave_channel(struct device *dev, const char *name)
 {
        /* If device-tree is present get slave info from here */
        if (dev->of_node)
                return of_dma_request_slave_channel(dev->of_node, name);
 
+       /* If device was enumerated by ACPI get slave info from here */
+       if (ACPI_HANDLE(dev))
+               return acpi_dma_request_slave_chan_by_name(dev, name);
+
        return NULL;
 }
 EXPORT_SYMBOL_GPL(dma_request_slave_channel);
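
With the hunk above, the same slave-channel request now resolves through ACPI as well as device tree. A hedged consumer-side fragment (error handling trimmed; on ACPI systems the "tx"/"rx" names map to the first and second FixedDMA descriptors, per acpi_dma_request_slave_chan_by_name() earlier in this diff):

	struct dma_chan *tx = dma_request_slave_channel(dev, "tx");
	struct dma_chan *rx = dma_request_slave_channel(dev, "rx");

	if (!tx || !rx)
		return -ENODEV;	/* or fall back to dma_request_channel() */
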
index a2c8904b63ea44fd9154631170c391bd601054d2..d8ce4ecfef18e079336654b8a53ea89fd613cb44 100644 (file)
@@ -2,6 +2,7 @@
  * DMA Engine test module
  *
  * Copyright (C) 2007 Atmel Corporation
+ * Copyright (C) 2013 Intel Corporation
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
 #include <linux/random.h>
 #include <linux/slab.h>
 #include <linux/wait.h>
+#include <linux/ctype.h>
+#include <linux/debugfs.h>
+#include <linux/uaccess.h>
+#include <linux/seq_file.h>
 
 static unsigned int test_buf_size = 16384;
 module_param(test_buf_size, uint, S_IRUGO);
@@ -61,6 +66,9 @@ module_param(timeout, uint, S_IRUGO);
 MODULE_PARM_DESC(timeout, "Transfer Timeout in msec (default: 3000), "
                 "Pass -1 for infinite timeout");
 
+/* Maximum amount of mismatched bytes in buffer to print */
+#define MAX_ERROR_COUNT                32
+
 /*
  * Initialization patterns. All bytes in the source buffer has bit 7
  * set, all bytes in the destination buffer has bit 7 cleared.
@@ -78,13 +86,65 @@ MODULE_PARM_DESC(timeout, "Transfer Timeout in msec (default: 3000), "
 #define PATTERN_OVERWRITE      0x20
 #define PATTERN_COUNT_MASK     0x1f
 
+enum dmatest_error_type {
+       DMATEST_ET_OK,
+       DMATEST_ET_MAP_SRC,
+       DMATEST_ET_MAP_DST,
+       DMATEST_ET_PREP,
+       DMATEST_ET_SUBMIT,
+       DMATEST_ET_TIMEOUT,
+       DMATEST_ET_DMA_ERROR,
+       DMATEST_ET_DMA_IN_PROGRESS,
+       DMATEST_ET_VERIFY,
+       DMATEST_ET_VERIFY_BUF,
+};
+
+struct dmatest_verify_buffer {
+       unsigned int    index;
+       u8              expected;
+       u8              actual;
+};
+
+struct dmatest_verify_result {
+       unsigned int                    error_count;
+       struct dmatest_verify_buffer    data[MAX_ERROR_COUNT];
+       u8                              pattern;
+       bool                            is_srcbuf;
+};
+
+struct dmatest_thread_result {
+       struct list_head        node;
+       unsigned int            n;
+       unsigned int            src_off;
+       unsigned int            dst_off;
+       unsigned int            len;
+       enum dmatest_error_type type;
+       union {
+               unsigned long                   data;
+               dma_cookie_t                    cookie;
+               enum dma_status                 status;
+               int                             error;
+               struct dmatest_verify_result    *vr;
+       };
+};
+
+struct dmatest_result {
+       struct list_head        node;
+       char                    *name;
+       struct list_head        results;
+};
+
+struct dmatest_info;
+
 struct dmatest_thread {
        struct list_head        node;
+       struct dmatest_info     *info;
        struct task_struct      *task;
        struct dma_chan         *chan;
        u8                      **srcs;
        u8                      **dsts;
        enum dma_transaction_type type;
+       bool                    done;
 };
 
 struct dmatest_chan {
@@ -93,25 +153,69 @@ struct dmatest_chan {
        struct list_head        threads;
 };
 
-/*
- * These are protected by dma_list_mutex since they're only used by
- * the DMA filter function callback
+/**
+ * struct dmatest_params - test parameters.
+ * @buf_size:          size of the memcpy test buffer
+ * @channel:           bus ID of the channel to test
+ * @device:            bus ID of the DMA Engine to test
+ * @threads_per_chan:  number of threads to start per channel
+ * @max_channels:      maximum number of channels to use
+ * @iterations:                iterations before stopping test
+ * @xor_sources:       number of xor source buffers
+ * @pq_sources:                number of p+q source buffers
+ * @timeout:           transfer timeout in msec, -1 for infinite timeout
  */
-static LIST_HEAD(dmatest_channels);
-static unsigned int nr_channels;
+struct dmatest_params {
+       unsigned int    buf_size;
+       char            channel[20];
+       char            device[20];
+       unsigned int    threads_per_chan;
+       unsigned int    max_channels;
+       unsigned int    iterations;
+       unsigned int    xor_sources;
+       unsigned int    pq_sources;
+       int             timeout;
+};
 
-static bool dmatest_match_channel(struct dma_chan *chan)
+/**
+ * struct dmatest_info - test information.
+ * @params:            test parameters
+ * @lock:              access protection to the fields of this structure
+ */
+struct dmatest_info {
+       /* Test parameters */
+       struct dmatest_params   params;
+
+       /* Internal state */
+       struct list_head        channels;
+       unsigned int            nr_channels;
+       struct mutex            lock;
+
+       /* debugfs related stuff */
+       struct dentry           *root;
+       struct dmatest_params   dbgfs_params;
+
+       /* Test results */
+       struct list_head        results;
+       struct mutex            results_lock;
+};
+
+static struct dmatest_info test_info;
+
+static bool dmatest_match_channel(struct dmatest_params *params,
+               struct dma_chan *chan)
 {
-       if (test_channel[0] == '\0')
+       if (params->channel[0] == '\0')
                return true;
-       return strcmp(dma_chan_name(chan), test_channel) == 0;
+       return strcmp(dma_chan_name(chan), params->channel) == 0;
 }
 
-static bool dmatest_match_device(struct dma_device *device)
+static bool dmatest_match_device(struct dmatest_params *params,
+               struct dma_device *device)
 {
-       if (test_device[0] == '\0')
+       if (params->device[0] == '\0')
                return true;
-       return strcmp(dev_name(device->dev), test_device) == 0;
+       return strcmp(dev_name(device->dev), params->device) == 0;
 }
 
 static unsigned long dmatest_random(void)
@@ -122,7 +226,8 @@ static unsigned long dmatest_random(void)
        return buf;
 }
 
-static void dmatest_init_srcs(u8 **bufs, unsigned int start, unsigned int len)
+static void dmatest_init_srcs(u8 **bufs, unsigned int start, unsigned int len,
+               unsigned int buf_size)
 {
        unsigned int i;
        u8 *buf;
@@ -133,13 +238,14 @@ static void dmatest_init_srcs(u8 **bufs, unsigned int start, unsigned int len)
                for ( ; i < start + len; i++)
                        buf[i] = PATTERN_SRC | PATTERN_COPY
                                | (~i & PATTERN_COUNT_MASK);
-               for ( ; i < test_buf_size; i++)
+               for ( ; i < buf_size; i++)
                        buf[i] = PATTERN_SRC | (~i & PATTERN_COUNT_MASK);
                buf++;
        }
 }
 
-static void dmatest_init_dsts(u8 **bufs, unsigned int start, unsigned int len)
+static void dmatest_init_dsts(u8 **bufs, unsigned int start, unsigned int len,
+               unsigned int buf_size)
 {
        unsigned int i;
        u8 *buf;
@@ -150,40 +256,14 @@ static void dmatest_init_dsts(u8 **bufs, unsigned int start, unsigned int len)
                for ( ; i < start + len; i++)
                        buf[i] = PATTERN_DST | PATTERN_OVERWRITE
                                | (~i & PATTERN_COUNT_MASK);
-               for ( ; i < test_buf_size; i++)
+               for ( ; i < buf_size; i++)
                        buf[i] = PATTERN_DST | (~i & PATTERN_COUNT_MASK);
        }
 }
 
-static void dmatest_mismatch(u8 actual, u8 pattern, unsigned int index,
-               unsigned int counter, bool is_srcbuf)
-{
-       u8              diff = actual ^ pattern;
-       u8              expected = pattern | (~counter & PATTERN_COUNT_MASK);
-       const char      *thread_name = current->comm;
-
-       if (is_srcbuf)
-               pr_warning("%s: srcbuf[0x%x] overwritten!"
-                               " Expected %02x, got %02x\n",
-                               thread_name, index, expected, actual);
-       else if ((pattern & PATTERN_COPY)
-                       && (diff & (PATTERN_COPY | PATTERN_OVERWRITE)))
-               pr_warning("%s: dstbuf[0x%x] not copied!"
-                               " Expected %02x, got %02x\n",
-                               thread_name, index, expected, actual);
-       else if (diff & PATTERN_SRC)
-               pr_warning("%s: dstbuf[0x%x] was copied!"
-                               " Expected %02x, got %02x\n",
-                               thread_name, index, expected, actual);
-       else
-               pr_warning("%s: dstbuf[0x%x] mismatch!"
-                               " Expected %02x, got %02x\n",
-                               thread_name, index, expected, actual);
-}
-
-static unsigned int dmatest_verify(u8 **bufs, unsigned int start,
-               unsigned int end, unsigned int counter, u8 pattern,
-               bool is_srcbuf)
+static unsigned int dmatest_verify(struct dmatest_verify_result *vr, u8 **bufs,
+               unsigned int start, unsigned int end, unsigned int counter,
+               u8 pattern, bool is_srcbuf)
 {
        unsigned int i;
        unsigned int error_count = 0;
@@ -191,6 +271,7 @@ static unsigned int dmatest_verify(u8 **bufs, unsigned int start,
        u8 expected;
        u8 *buf;
        unsigned int counter_orig = counter;
+       struct dmatest_verify_buffer *vb;
 
        for (; (buf = *bufs); bufs++) {
                counter = counter_orig;
@@ -198,18 +279,21 @@ static unsigned int dmatest_verify(u8 **bufs, unsigned int start,
                        actual = buf[i];
                        expected = pattern | (~counter & PATTERN_COUNT_MASK);
                        if (actual != expected) {
-                               if (error_count < 32)
-                                       dmatest_mismatch(actual, pattern, i,
-                                                        counter, is_srcbuf);
+                               if (error_count < MAX_ERROR_COUNT && vr) {
+                                       vb = &vr->data[error_count];
+                                       vb->index = i;
+                                       vb->expected = expected;
+                                       vb->actual = actual;
+                               }
                                error_count++;
                        }
                        counter++;
                }
        }
 
-       if (error_count > 32)
+       if (error_count > MAX_ERROR_COUNT)
                pr_warning("%s: %u errors suppressed\n",
-                       current->comm, error_count - 32);
+                       current->comm, error_count - MAX_ERROR_COUNT);
 
        return error_count;
 }
@@ -249,6 +333,170 @@ static unsigned int min_odd(unsigned int x, unsigned int y)
        return val % 2 ? val : val - 1;
 }
 
+static char *verify_result_get_one(struct dmatest_verify_result *vr,
+               unsigned int i)
+{
+       struct dmatest_verify_buffer *vb = &vr->data[i];
+       u8 diff = vb->actual ^ vr->pattern;
+       static char buf[512];
+       char *msg;
+
+       if (vr->is_srcbuf)
+               msg = "srcbuf overwritten!";
+       else if ((vr->pattern & PATTERN_COPY)
+                       && (diff & (PATTERN_COPY | PATTERN_OVERWRITE)))
+               msg = "dstbuf not copied!";
+       else if (diff & PATTERN_SRC)
+               msg = "dstbuf was copied!";
+       else
+               msg = "dstbuf mismatch!";
+
+       snprintf(buf, sizeof(buf) - 1, "%s [0x%x] Expected %02x, got %02x", msg,
+                vb->index, vb->expected, vb->actual);
+
+       return buf;
+}
+
+static char *thread_result_get(const char *name,
+               struct dmatest_thread_result *tr)
+{
+       static const char * const messages[] = {
+               [DMATEST_ET_OK]                 = "No errors",
+               [DMATEST_ET_MAP_SRC]            = "src mapping error",
+               [DMATEST_ET_MAP_DST]            = "dst mapping error",
+               [DMATEST_ET_PREP]               = "prep error",
+               [DMATEST_ET_SUBMIT]             = "submit error",
+               [DMATEST_ET_TIMEOUT]            = "test timed out",
+               [DMATEST_ET_DMA_ERROR]          =
+                       "got completion callback (DMA_ERROR)",
+               [DMATEST_ET_DMA_IN_PROGRESS]    =
+                       "got completion callback (DMA_IN_PROGRESS)",
+               [DMATEST_ET_VERIFY]             = "errors",
+               [DMATEST_ET_VERIFY_BUF]         = "verify errors",
+       };
+       static char buf[512];
+
+       snprintf(buf, sizeof(buf) - 1,
+                "%s: #%u: %s with src_off=0x%x ""dst_off=0x%x len=0x%x (%lu)",
+                name, tr->n, messages[tr->type], tr->src_off, tr->dst_off,
+                tr->len, tr->data);
+
+       return buf;
+}
+
+static int thread_result_add(struct dmatest_info *info,
+               struct dmatest_result *r, enum dmatest_error_type type,
+               unsigned int n, unsigned int src_off, unsigned int dst_off,
+               unsigned int len, unsigned long data)
+{
+       struct dmatest_thread_result *tr;
+
+       tr = kzalloc(sizeof(*tr), GFP_KERNEL);
+       if (!tr)
+               return -ENOMEM;
+
+       tr->type = type;
+       tr->n = n;
+       tr->src_off = src_off;
+       tr->dst_off = dst_off;
+       tr->len = len;
+       tr->data = data;
+
+       mutex_lock(&info->results_lock);
+       list_add_tail(&tr->node, &r->results);
+       mutex_unlock(&info->results_lock);
+
+       pr_warn("%s\n", thread_result_get(r->name, tr));
+       return 0;
+}
+
+static unsigned int verify_result_add(struct dmatest_info *info,
+               struct dmatest_result *r, unsigned int n,
+               unsigned int src_off, unsigned int dst_off, unsigned int len,
+               u8 **bufs, int whence, unsigned int counter, u8 pattern,
+               bool is_srcbuf)
+{
+       struct dmatest_verify_result *vr;
+       unsigned int error_count;
+       unsigned int buf_off = is_srcbuf ? src_off : dst_off;
+       unsigned int start, end;
+
+       if (whence < 0) {
+               start = 0;
+               end = buf_off;
+       } else if (whence > 0) {
+               start = buf_off + len;
+               end = info->params.buf_size;
+       } else {
+               start = buf_off;
+               end = buf_off + len;
+       }
+
+       vr = kmalloc(sizeof(*vr), GFP_KERNEL);
+       if (!vr) {
+               pr_warn("dmatest: No memory to store verify result\n");
+               return dmatest_verify(NULL, bufs, start, end, counter, pattern,
+                                     is_srcbuf);
+       }
+
+       vr->pattern = pattern;
+       vr->is_srcbuf = is_srcbuf;
+
+       error_count = dmatest_verify(vr, bufs, start, end, counter, pattern,
+                                    is_srcbuf);
+       if (error_count) {
+               vr->error_count = error_count;
+               thread_result_add(info, r, DMATEST_ET_VERIFY_BUF, n, src_off,
+                                 dst_off, len, (unsigned long)vr);
+               return error_count;
+       }
+
+       kfree(vr);
+       return 0;
+}
+
+static void result_free(struct dmatest_info *info, const char *name)
+{
+       struct dmatest_result *r, *_r;
+
+       mutex_lock(&info->results_lock);
+       list_for_each_entry_safe(r, _r, &info->results, node) {
+               struct dmatest_thread_result *tr, *_tr;
+
+               if (name && strcmp(r->name, name))
+                       continue;
+
+               list_for_each_entry_safe(tr, _tr, &r->results, node) {
+                       if (tr->type == DMATEST_ET_VERIFY_BUF)
+                               kfree(tr->vr);
+                       list_del(&tr->node);
+                       kfree(tr);
+               }
+
+               kfree(r->name);
+               list_del(&r->node);
+               kfree(r);
+       }
+
+       mutex_unlock(&info->results_lock);
+}
+
+static struct dmatest_result *result_init(struct dmatest_info *info,
+               const char *name)
+{
+       struct dmatest_result *r;
+
+       r = kzalloc(sizeof(*r), GFP_KERNEL);
+       if (r) {
+               r->name = kstrdup(name, GFP_KERNEL);
+               INIT_LIST_HEAD(&r->results);
+               mutex_lock(&info->results_lock);
+               list_add_tail(&r->node, &info->results);
+               mutex_unlock(&info->results_lock);
+       }
+       return r;
+}
+
 /*
  * This function repeatedly tests DMA transfers of various lengths and
  * offsets for a given operation type until it is told to exit by
@@ -268,6 +516,8 @@ static int dmatest_func(void *data)
        DECLARE_WAIT_QUEUE_HEAD_ONSTACK(done_wait);
        struct dmatest_thread   *thread = data;
        struct dmatest_done     done = { .wait = &done_wait };
+       struct dmatest_info     *info;
+       struct dmatest_params   *params;
        struct dma_chan         *chan;
        struct dma_device       *dev;
        const char              *thread_name;
@@ -278,11 +528,12 @@ static int dmatest_func(void *data)
        dma_cookie_t            cookie;
        enum dma_status         status;
        enum dma_ctrl_flags     flags;
-       u8                      pq_coefs[pq_sources + 1];
+       u8                      *pq_coefs = NULL;
        int                     ret;
        int                     src_cnt;
        int                     dst_cnt;
        int                     i;
+       struct dmatest_result   *result;
 
        thread_name = current->comm;
        set_freezable();
@@ -290,28 +541,39 @@ static int dmatest_func(void *data)
        ret = -ENOMEM;
 
        smp_rmb();
+       info = thread->info;
+       params = &info->params;
        chan = thread->chan;
        dev = chan->device;
        if (thread->type == DMA_MEMCPY)
                src_cnt = dst_cnt = 1;
        else if (thread->type == DMA_XOR) {
                /* force odd to ensure dst = src */
-               src_cnt = min_odd(xor_sources | 1, dev->max_xor);
+               src_cnt = min_odd(params->xor_sources | 1, dev->max_xor);
                dst_cnt = 1;
        } else if (thread->type == DMA_PQ) {
                /* force odd to ensure dst = src */
-               src_cnt = min_odd(pq_sources | 1, dma_maxpq(dev, 0));
+               src_cnt = min_odd(params->pq_sources | 1, dma_maxpq(dev, 0));
                dst_cnt = 2;
+
+               pq_coefs = kmalloc(params->pq_sources+1, GFP_KERNEL);
+               if (!pq_coefs)
+                       goto err_thread_type;
+
                for (i = 0; i < src_cnt; i++)
                        pq_coefs[i] = 1;
        } else
+               goto err_thread_type;
+
+       result = result_init(info, thread_name);
+       if (!result)
                goto err_srcs;
 
        thread->srcs = kcalloc(src_cnt+1, sizeof(u8 *), GFP_KERNEL);
        if (!thread->srcs)
                goto err_srcs;
        for (i = 0; i < src_cnt; i++) {
-               thread->srcs[i] = kmalloc(test_buf_size, GFP_KERNEL);
+               thread->srcs[i] = kmalloc(params->buf_size, GFP_KERNEL);
                if (!thread->srcs[i])
                        goto err_srcbuf;
        }
@@ -321,7 +583,7 @@ static int dmatest_func(void *data)
        if (!thread->dsts)
                goto err_dsts;
        for (i = 0; i < dst_cnt; i++) {
-               thread->dsts[i] = kmalloc(test_buf_size, GFP_KERNEL);
+               thread->dsts[i] = kmalloc(params->buf_size, GFP_KERNEL);
                if (!thread->dsts[i])
                        goto err_dstbuf;
        }
@@ -337,7 +599,7 @@ static int dmatest_func(void *data)
              | DMA_COMPL_SKIP_DEST_UNMAP | DMA_COMPL_SRC_UNMAP_SINGLE;
 
        while (!kthread_should_stop()
-              && !(iterations && total_tests >= iterations)) {
+              && !(params->iterations && total_tests >= params->iterations)) {
                struct dma_async_tx_descriptor *tx = NULL;
                dma_addr_t dma_srcs[src_cnt];
                dma_addr_t dma_dsts[dst_cnt];
@@ -353,24 +615,24 @@ static int dmatest_func(void *data)
                else if (thread->type == DMA_PQ)
                        align = dev->pq_align;
 
-               if (1 << align > test_buf_size) {
+               if (1 << align > params->buf_size) {
                        pr_err("%u-byte buffer too small for %d-byte alignment\n",
-                              test_buf_size, 1 << align);
+                              params->buf_size, 1 << align);
                        break;
                }
 
-               len = dmatest_random() % test_buf_size + 1;
+               len = dmatest_random() % params->buf_size + 1;
                len = (len >> align) << align;
                if (!len)
                        len = 1 << align;
-               src_off = dmatest_random() % (test_buf_size - len + 1);
-               dst_off = dmatest_random() % (test_buf_size - len + 1);
+               src_off = dmatest_random() % (params->buf_size - len + 1);
+               dst_off = dmatest_random() % (params->buf_size - len + 1);
 
                src_off = (src_off >> align) << align;
                dst_off = (dst_off >> align) << align;
 
-               dmatest_init_srcs(thread->srcs, src_off, len);
-               dmatest_init_dsts(thread->dsts, dst_off, len);
+               dmatest_init_srcs(thread->srcs, src_off, len, params->buf_size);
+               dmatest_init_dsts(thread->dsts, dst_off, len, params->buf_size);
 
                for (i = 0; i < src_cnt; i++) {
                        u8 *buf = thread->srcs[i] + src_off;
@@ -380,10 +642,10 @@ static int dmatest_func(void *data)
                        ret = dma_mapping_error(dev->dev, dma_srcs[i]);
                        if (ret) {
                                unmap_src(dev->dev, dma_srcs, len, i);
-                               pr_warn("%s: #%u: mapping error %d with "
-                                       "src_off=0x%x len=0x%x\n",
-                                       thread_name, total_tests - 1, ret,
-                                       src_off, len);
+                               thread_result_add(info, result,
+                                                 DMATEST_ET_MAP_SRC,
+                                                 total_tests, src_off, dst_off,
+                                                 len, ret);
                                failed_tests++;
                                continue;
                        }
@@ -391,16 +653,17 @@ static int dmatest_func(void *data)
                /* map with DMA_BIDIRECTIONAL to force writeback/invalidate */
                for (i = 0; i < dst_cnt; i++) {
                        dma_dsts[i] = dma_map_single(dev->dev, thread->dsts[i],
-                                                    test_buf_size,
+                                                    params->buf_size,
                                                     DMA_BIDIRECTIONAL);
                        ret = dma_mapping_error(dev->dev, dma_dsts[i]);
                        if (ret) {
                                unmap_src(dev->dev, dma_srcs, len, src_cnt);
-                               unmap_dst(dev->dev, dma_dsts, test_buf_size, i);
-                               pr_warn("%s: #%u: mapping error %d with "
-                                       "dst_off=0x%x len=0x%x\n",
-                                       thread_name, total_tests - 1, ret,
-                                       dst_off, test_buf_size);
+                               unmap_dst(dev->dev, dma_dsts, params->buf_size,
+                                         i);
+                               thread_result_add(info, result,
+                                                 DMATEST_ET_MAP_DST,
+                                                 total_tests, src_off, dst_off,
+                                                 len, ret);
                                failed_tests++;
                                continue;
                        }
@@ -428,11 +691,11 @@ static int dmatest_func(void *data)
 
                if (!tx) {
                        unmap_src(dev->dev, dma_srcs, len, src_cnt);
-                       unmap_dst(dev->dev, dma_dsts, test_buf_size, dst_cnt);
-                       pr_warning("%s: #%u: prep error with src_off=0x%x "
-                                       "dst_off=0x%x len=0x%x\n",
-                                       thread_name, total_tests - 1,
-                                       src_off, dst_off, len);
+                       unmap_dst(dev->dev, dma_dsts, params->buf_size,
+                                 dst_cnt);
+                       thread_result_add(info, result, DMATEST_ET_PREP,
+                                         total_tests, src_off, dst_off,
+                                         len, 0);
                        msleep(100);
                        failed_tests++;
                        continue;
@@ -444,18 +707,18 @@ static int dmatest_func(void *data)
                cookie = tx->tx_submit(tx);
 
                if (dma_submit_error(cookie)) {
-                       pr_warning("%s: #%u: submit error %d with src_off=0x%x "
-                                       "dst_off=0x%x len=0x%x\n",
-                                       thread_name, total_tests - 1, cookie,
-                                       src_off, dst_off, len);
+                       thread_result_add(info, result, DMATEST_ET_SUBMIT,
+                                         total_tests, src_off, dst_off,
+                                         len, cookie);
                        msleep(100);
                        failed_tests++;
                        continue;
                }
                dma_async_issue_pending(chan);
 
-               wait_event_freezable_timeout(done_wait, done.done,
-                                            msecs_to_jiffies(timeout));
+               wait_event_freezable_timeout(done_wait,
+                                            done.done || kthread_should_stop(),
+                                            msecs_to_jiffies(params->timeout));
 
                status = dma_async_is_tx_complete(chan, cookie, NULL, NULL);
 
@@ -468,56 +731,57 @@ static int dmatest_func(void *data)
                         * free it this time?" dancing.  For now, just
                         * leave it dangling.
                         */
-                       pr_warning("%s: #%u: test timed out\n",
-                                  thread_name, total_tests - 1);
+                       thread_result_add(info, result, DMATEST_ET_TIMEOUT,
+                                         total_tests, src_off, dst_off,
+                                         len, 0);
                        failed_tests++;
                        continue;
                } else if (status != DMA_SUCCESS) {
-                       pr_warning("%s: #%u: got completion callback,"
-                                  " but status is \'%s\'\n",
-                                  thread_name, total_tests - 1,
-                                  status == DMA_ERROR ? "error" : "in progress");
+                       enum dmatest_error_type type = (status == DMA_ERROR) ?
+                               DMATEST_ET_DMA_ERROR : DMATEST_ET_DMA_IN_PROGRESS;
+                       thread_result_add(info, result, type,
+                                         total_tests, src_off, dst_off,
+                                         len, status);
                        failed_tests++;
                        continue;
                }
 
                /* Unmap by myself (see DMA_COMPL_SKIP_DEST_UNMAP above) */
-               unmap_dst(dev->dev, dma_dsts, test_buf_size, dst_cnt);
+               unmap_dst(dev->dev, dma_dsts, params->buf_size, dst_cnt);
 
                error_count = 0;
 
                pr_debug("%s: verifying source buffer...\n", thread_name);
-               error_count += dmatest_verify(thread->srcs, 0, src_off,
+               error_count += verify_result_add(info, result, total_tests,
+                               src_off, dst_off, len, thread->srcs, -1,
                                0, PATTERN_SRC, true);
-               error_count += dmatest_verify(thread->srcs, src_off,
-                               src_off + len, src_off,
-                               PATTERN_SRC | PATTERN_COPY, true);
-               error_count += dmatest_verify(thread->srcs, src_off + len,
-                               test_buf_size, src_off + len,
-                               PATTERN_SRC, true);
-
-               pr_debug("%s: verifying dest buffer...\n",
-                               thread->task->comm);
-               error_count += dmatest_verify(thread->dsts, 0, dst_off,
+               error_count += verify_result_add(info, result, total_tests,
+                               src_off, dst_off, len, thread->srcs, 0,
+                               src_off, PATTERN_SRC | PATTERN_COPY, true);
+               error_count += verify_result_add(info, result, total_tests,
+                               src_off, dst_off, len, thread->srcs, 1,
+                               src_off + len, PATTERN_SRC, true);
+
+               pr_debug("%s: verifying dest buffer...\n", thread_name);
+               error_count += verify_result_add(info, result, total_tests,
+                               src_off, dst_off, len, thread->dsts, -1,
                                0, PATTERN_DST, false);
-               error_count += dmatest_verify(thread->dsts, dst_off,
-                               dst_off + len, src_off,
-                               PATTERN_SRC | PATTERN_COPY, false);
-               error_count += dmatest_verify(thread->dsts, dst_off + len,
-                               test_buf_size, dst_off + len,
-                               PATTERN_DST, false);
+               error_count += verify_result_add(info, result, total_tests,
+                               src_off, dst_off, len, thread->dsts, 0,
+                               src_off, PATTERN_SRC | PATTERN_COPY, false);
+               error_count += verify_result_add(info, result, total_tests,
+                               src_off, dst_off, len, thread->dsts, 1,
+                               dst_off + len, PATTERN_DST, false);
 
                if (error_count) {
-                       pr_warning("%s: #%u: %u errors with "
-                               "src_off=0x%x dst_off=0x%x len=0x%x\n",
-                               thread_name, total_tests - 1, error_count,
-                               src_off, dst_off, len);
+                       thread_result_add(info, result, DMATEST_ET_VERIFY,
+                                         total_tests, src_off, dst_off,
+                                         len, error_count);
                        failed_tests++;
                } else {
-                       pr_debug("%s: #%u: No errors with "
-                               "src_off=0x%x dst_off=0x%x len=0x%x\n",
-                               thread_name, total_tests - 1,
-                               src_off, dst_off, len);
+                       thread_result_add(info, result, DMATEST_ET_OK,
+                                         total_tests, src_off, dst_off,
+                                         len, 0);
                }
        }
 
@@ -532,6 +796,8 @@ err_dsts:
 err_srcbuf:
        kfree(thread->srcs);
 err_srcs:
+       kfree(pq_coefs);
+err_thread_type:
        pr_notice("%s: terminating after %u tests, %u failures (status %d)\n",
                        thread_name, total_tests, failed_tests, ret);
 
@@ -539,7 +805,9 @@ err_srcs:
        if (ret)
                dmaengine_terminate_all(chan);
 
-       if (iterations > 0)
+       thread->done = true;
+
+       if (params->iterations > 0)
                while (!kthread_should_stop()) {
                        DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wait_dmatest_exit);
                        interruptible_sleep_on(&wait_dmatest_exit);
@@ -568,8 +836,10 @@ static void dmatest_cleanup_channel(struct dmatest_chan *dtc)
        kfree(dtc);
 }
 
-static int dmatest_add_threads(struct dmatest_chan *dtc, enum dma_transaction_type type)
+static int dmatest_add_threads(struct dmatest_info *info,
+               struct dmatest_chan *dtc, enum dma_transaction_type type)
 {
+       struct dmatest_params *params = &info->params;
        struct dmatest_thread *thread;
        struct dma_chan *chan = dtc->chan;
        char *op;
@@ -584,7 +854,7 @@ static int dmatest_add_threads(struct dmatest_chan *dtc, enum dma_transaction_ty
        else
                return -EINVAL;
 
-       for (i = 0; i < threads_per_chan; i++) {
+       for (i = 0; i < params->threads_per_chan; i++) {
                thread = kzalloc(sizeof(struct dmatest_thread), GFP_KERNEL);
                if (!thread) {
                        pr_warning("dmatest: No memory for %s-%s%u\n",
@@ -592,6 +862,7 @@ static int dmatest_add_threads(struct dmatest_chan *dtc, enum dma_transaction_ty
 
                        break;
                }
+               thread->info = info;
                thread->chan = dtc->chan;
                thread->type = type;
                smp_wmb();
@@ -612,7 +883,8 @@ static int dmatest_add_threads(struct dmatest_chan *dtc, enum dma_transaction_ty
        return i;
 }
 
-static int dmatest_add_channel(struct dma_chan *chan)
+static int dmatest_add_channel(struct dmatest_info *info,
+               struct dma_chan *chan)
 {
        struct dmatest_chan     *dtc;
        struct dma_device       *dma_dev = chan->device;
@@ -629,75 +901,418 @@ static int dmatest_add_channel(struct dma_chan *chan)
        INIT_LIST_HEAD(&dtc->threads);
 
        if (dma_has_cap(DMA_MEMCPY, dma_dev->cap_mask)) {
-               cnt = dmatest_add_threads(dtc, DMA_MEMCPY);
+               cnt = dmatest_add_threads(info, dtc, DMA_MEMCPY);
                thread_count += cnt > 0 ? cnt : 0;
        }
        if (dma_has_cap(DMA_XOR, dma_dev->cap_mask)) {
-               cnt = dmatest_add_threads(dtc, DMA_XOR);
+               cnt = dmatest_add_threads(info, dtc, DMA_XOR);
                thread_count += cnt > 0 ? cnt : 0;
        }
        if (dma_has_cap(DMA_PQ, dma_dev->cap_mask)) {
-               cnt = dmatest_add_threads(dtc, DMA_PQ);
+               cnt = dmatest_add_threads(info, dtc, DMA_PQ);
                thread_count += cnt > 0 ? cnt : 0;
        }
 
        pr_info("dmatest: Started %u threads using %s\n",
                thread_count, dma_chan_name(chan));
 
-       list_add_tail(&dtc->node, &dmatest_channels);
-       nr_channels++;
+       list_add_tail(&dtc->node, &info->channels);
+       info->nr_channels++;
 
        return 0;
 }
 
 static bool filter(struct dma_chan *chan, void *param)
 {
-       if (!dmatest_match_channel(chan) || !dmatest_match_device(chan->device))
+       struct dmatest_params *params = param;
+
+       if (!dmatest_match_channel(params, chan) ||
+           !dmatest_match_device(params, chan->device))
                return false;
        else
                return true;
 }
 
-static int __init dmatest_init(void)
+static int __run_threaded_test(struct dmatest_info *info)
 {
        dma_cap_mask_t mask;
        struct dma_chan *chan;
+       struct dmatest_params *params = &info->params;
        int err = 0;
 
        dma_cap_zero(mask);
        dma_cap_set(DMA_MEMCPY, mask);
        for (;;) {
-               chan = dma_request_channel(mask, filter, NULL);
+               chan = dma_request_channel(mask, filter, params);
                if (chan) {
-                       err = dmatest_add_channel(chan);
+                       err = dmatest_add_channel(info, chan);
                        if (err) {
                                dma_release_channel(chan);
                                break; /* add_channel failed, punt */
                        }
                } else
                        break; /* no more channels available */
-               if (max_channels && nr_channels >= max_channels)
+               if (params->max_channels &&
+                   info->nr_channels >= params->max_channels)
                        break; /* we have all we need */
        }
-
        return err;
 }
-/* when compiled-in wait for drivers to load first */
-late_initcall(dmatest_init);
 
-static void __exit dmatest_exit(void)
+#ifndef MODULE
+static int run_threaded_test(struct dmatest_info *info)
+{
+       int ret;
+
+       mutex_lock(&info->lock);
+       ret = __run_threaded_test(info);
+       mutex_unlock(&info->lock);
+       return ret;
+}
+#endif
+
+static void __stop_threaded_test(struct dmatest_info *info)
 {
        struct dmatest_chan *dtc, *_dtc;
        struct dma_chan *chan;
 
-       list_for_each_entry_safe(dtc, _dtc, &dmatest_channels, node) {
+       list_for_each_entry_safe(dtc, _dtc, &info->channels, node) {
                list_del(&dtc->node);
                chan = dtc->chan;
                dmatest_cleanup_channel(dtc);
-               pr_debug("dmatest: dropped channel %s\n",
-                        dma_chan_name(chan));
+               pr_debug("dmatest: dropped channel %s\n", dma_chan_name(chan));
                dma_release_channel(chan);
        }
+
+       info->nr_channels = 0;
+}
+
+static void stop_threaded_test(struct dmatest_info *info)
+{
+       mutex_lock(&info->lock);
+       __stop_threaded_test(info);
+       mutex_unlock(&info->lock);
+}
+
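+/* Stop any running test; if requested, apply the debugfs parameters and start a new one */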
+static int __restart_threaded_test(struct dmatest_info *info, bool run)
+{
+       struct dmatest_params *params = &info->params;
+       int ret;
+
+       /* Stop any running test first */
+       __stop_threaded_test(info);
+
+       if (run == false)
+               return 0;
+
+       /* Clear results from previous run */
+       result_free(info, NULL);
+
+       /* Copy test parameters */
+       memcpy(params, &info->dbgfs_params, sizeof(*params));
+
+       /* Run test with new parameters */
+       ret = __run_threaded_test(info);
+       if (ret) {
+               __stop_threaded_test(info);
+               pr_err("dmatest: Can't run test\n");
+       }
+
+       return ret;
+}
+
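+/* Copy a user-supplied string into a fixed-size parameter buffer, stripping surrounding whitespace */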
+static ssize_t dtf_write_string(void *to, size_t available, loff_t *ppos,
+               const void __user *from, size_t count)
+{
+       char tmp[20];
+       ssize_t len;
+
+       len = simple_write_to_buffer(tmp, sizeof(tmp) - 1, ppos, from, count);
+       if (len >= 0) {
+               tmp[len] = '\0';
+               strlcpy(to, strim(tmp), available);
+       }
+
+       return len;
+}
+
+static ssize_t dtf_read_channel(struct file *file, char __user *buf,
+               size_t count, loff_t *ppos)
+{
+       struct dmatest_info *info = file->private_data;
+       return simple_read_from_buffer(buf, count, ppos,
+                       info->dbgfs_params.channel,
+                       strlen(info->dbgfs_params.channel));
+}
+
+static ssize_t dtf_write_channel(struct file *file, const char __user *buf,
+               size_t size, loff_t *ppos)
+{
+       struct dmatest_info *info = file->private_data;
+       return dtf_write_string(info->dbgfs_params.channel,
+                               sizeof(info->dbgfs_params.channel),
+                               ppos, buf, size);
+}
+
+static const struct file_operations dtf_channel_fops = {
+       .read   = dtf_read_channel,
+       .write  = dtf_write_channel,
+       .open   = simple_open,
+       .llseek = default_llseek,
+};
+
+static ssize_t dtf_read_device(struct file *file, char __user *buf,
+               size_t count, loff_t *ppos)
+{
+       struct dmatest_info *info = file->private_data;
+       return simple_read_from_buffer(buf, count, ppos,
+                       info->dbgfs_params.device,
+                       strlen(info->dbgfs_params.device));
+}
+
+static ssize_t dtf_write_device(struct file *file, const char __user *buf,
+               size_t size, loff_t *ppos)
+{
+       struct dmatest_info *info = file->private_data;
+       return dtf_write_string(info->dbgfs_params.device,
+                               sizeof(info->dbgfs_params.device),
+                               ppos, buf, size);
+}
+
+static const struct file_operations dtf_device_fops = {
+       .read   = dtf_read_device,
+       .write  = dtf_write_device,
+       .open   = simple_open,
+       .llseek = default_llseek,
+};
+
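+/* Reading 'run' reports whether any test thread is still running ('Y' or 'N') */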
+static ssize_t dtf_read_run(struct file *file, char __user *user_buf,
+               size_t count, loff_t *ppos)
+{
+       struct dmatest_info *info = file->private_data;
+       char buf[3];
+       struct dmatest_chan *dtc;
+       bool alive = false;
+
+       mutex_lock(&info->lock);
+       list_for_each_entry(dtc, &info->channels, node) {
+               struct dmatest_thread *thread;
+
+               list_for_each_entry(thread, &dtc->threads, node) {
+                       if (!thread->done) {
+                               alive = true;
+                               break;
+                       }
+               }
+       }
+
+       if (alive) {
+               buf[0] = 'Y';
+       } else {
+               __stop_threaded_test(info);
+               buf[0] = 'N';
+       }
+
+       mutex_unlock(&info->lock);
+       buf[1] = '\n';
+       buf[2] = 0x00;
+       return simple_read_from_buffer(user_buf, count, ppos, buf, 2);
+}
+
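+/* Writing a boolean to 'run' restarts or stops the test using the current debugfs parameters */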
+static ssize_t dtf_write_run(struct file *file, const char __user *user_buf,
+               size_t count, loff_t *ppos)
+{
+       struct dmatest_info *info = file->private_data;
+       char buf[16];
+       bool bv;
+       int ret = 0;
+
+       if (copy_from_user(buf, user_buf, min(count, (sizeof(buf) - 1))))
+               return -EFAULT;
+
+       if (strtobool(buf, &bv) == 0) {
+               mutex_lock(&info->lock);
+               ret = __restart_threaded_test(info, bv);
+               mutex_unlock(&info->lock);
+       }
+
+       return ret ? ret : count;
+}
+
+static const struct file_operations dtf_run_fops = {
+       .read   = dtf_read_run,
+       .write  = dtf_write_run,
+       .open   = simple_open,
+       .llseek = default_llseek,
+};
+
+static int dtf_results_show(struct seq_file *sf, void *data)
+{
+       struct dmatest_info *info = sf->private;
+       struct dmatest_result *result;
+       struct dmatest_thread_result *tr;
+       unsigned int i;
+
+       mutex_lock(&info->results_lock);
+       list_for_each_entry(result, &info->results, node) {
+               list_for_each_entry(tr, &result->results, node) {
+                       seq_printf(sf, "%s\n",
+                               thread_result_get(result->name, tr));
+                       if (tr->type == DMATEST_ET_VERIFY_BUF) {
+                               for (i = 0; i < tr->vr->error_count; i++) {
+                                       seq_printf(sf, "\t%s\n",
+                                               verify_result_get_one(tr->vr, i));
+                               }
+                       }
+               }
+       }
+
+       mutex_unlock(&info->results_lock);
+       return 0;
+}
+
+static int dtf_results_open(struct inode *inode, struct file *file)
+{
+       return single_open(file, dtf_results_show, inode->i_private);
+}
+
+static const struct file_operations dtf_results_fops = {
+       .open           = dtf_results_open,
+       .read           = seq_read,
+       .llseek         = seq_lseek,
+       .release        = single_release,
+};
+
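+/* Create the dmatest debugfs tree: tunable parameters plus the 'run' and 'results' nodes */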
+static int dmatest_register_dbgfs(struct dmatest_info *info)
+{
+       struct dentry *d;
+       struct dmatest_params *params = &info->dbgfs_params;
+       int ret = -ENOMEM;
+
+       d = debugfs_create_dir("dmatest", NULL);
+       if (IS_ERR(d))
+               return PTR_ERR(d);
+       if (!d)
+               goto err_root;
+
+       info->root = d;
+
+       /* Copy initial values */
+       memcpy(params, &info->params, sizeof(*params));
+
+       /* Test parameters */
+
+       d = debugfs_create_u32("test_buf_size", S_IWUSR | S_IRUGO, info->root,
+                              (u32 *)&params->buf_size);
+       if (IS_ERR_OR_NULL(d))
+               goto err_node;
+
+       d = debugfs_create_file("channel", S_IRUGO | S_IWUSR, info->root,
+                               info, &dtf_channel_fops);
+       if (IS_ERR_OR_NULL(d))
+               goto err_node;
+
+       d = debugfs_create_file("device", S_IRUGO | S_IWUSR, info->root,
+                               info, &dtf_device_fops);
+       if (IS_ERR_OR_NULL(d))
+               goto err_node;
+
+       d = debugfs_create_u32("threads_per_chan", S_IWUSR | S_IRUGO, info->root,
+                              (u32 *)&params->threads_per_chan);
+       if (IS_ERR_OR_NULL(d))
+               goto err_node;
+
+       d = debugfs_create_u32("max_channels", S_IWUSR | S_IRUGO, info->root,
+                              (u32 *)&params->max_channels);
+       if (IS_ERR_OR_NULL(d))
+               goto err_node;
+
+       d = debugfs_create_u32("iterations", S_IWUSR | S_IRUGO, info->root,
+                              (u32 *)&params->iterations);
+       if (IS_ERR_OR_NULL(d))
+               goto err_node;
+
+       d = debugfs_create_u32("xor_sources", S_IWUSR | S_IRUGO, info->root,
+                              (u32 *)&params->xor_sources);
+       if (IS_ERR_OR_NULL(d))
+               goto err_node;
+
+       d = debugfs_create_u32("pq_sources", S_IWUSR | S_IRUGO, info->root,
+                              (u32 *)&params->pq_sources);
+       if (IS_ERR_OR_NULL(d))
+               goto err_node;
+
+       d = debugfs_create_u32("timeout", S_IWUSR | S_IRUGO, info->root,
+                              (u32 *)&params->timeout);
+       if (IS_ERR_OR_NULL(d))
+               goto err_node;
+
+       /* Run or stop threaded test */
+       d = debugfs_create_file("run", S_IWUSR | S_IRUGO, info->root,
+                               info, &dtf_run_fops);
+       if (IS_ERR_OR_NULL(d))
+               goto err_node;
+
+       /* Results of test in progress */
+       d = debugfs_create_file("results", S_IRUGO, info->root, info,
+                               &dtf_results_fops);
+       if (IS_ERR_OR_NULL(d))
+               goto err_node;
+
+       return 0;
+
+err_node:
+       debugfs_remove_recursive(info->root);
+err_root:
+       pr_err("dmatest: Failed to initialize debugfs\n");
+       return ret;
+}
+
+static int __init dmatest_init(void)
+{
+       struct dmatest_info *info = &test_info;
+       struct dmatest_params *params = &info->params;
+       int ret;
+
+       memset(info, 0, sizeof(*info));
+
+       mutex_init(&info->lock);
+       INIT_LIST_HEAD(&info->channels);
+
+       mutex_init(&info->results_lock);
+       INIT_LIST_HEAD(&info->results);
+
+       /* Set default parameters */
+       params->buf_size = test_buf_size;
+       strlcpy(params->channel, test_channel, sizeof(params->channel));
+       strlcpy(params->device, test_device, sizeof(params->device));
+       params->threads_per_chan = threads_per_chan;
+       params->max_channels = max_channels;
+       params->iterations = iterations;
+       params->xor_sources = xor_sources;
+       params->pq_sources = pq_sources;
+       params->timeout = timeout;
+
+       ret = dmatest_register_dbgfs(info);
+       if (ret)
+               return ret;
+
+#ifdef MODULE
+       return 0;
+#else
+       return run_threaded_test(info);
+#endif
+}
+/* when compiled-in wait for drivers to load first */
+late_initcall(dmatest_init);
+
+static void __exit dmatest_exit(void)
+{
+       struct dmatest_info *info = &test_info;
+
+       debugfs_remove_recursive(info->root);
+       stop_threaded_test(info);
+       result_free(info, NULL);
 }
 module_exit(dmatest_exit);
 
index 43a5329d44837c4042687d6f93436b3caf0627c1..2e5deaa82b60579d24bd80fc42ac5327ac5e273f 100644 (file)
@@ -25,6 +25,8 @@
 #include <linux/module.h>
 #include <linux/platform_device.h>
 #include <linux/slab.h>
+#include <linux/acpi.h>
+#include <linux/acpi_dma.h>
 
 #include "dw_dmac_regs.h"
 #include "dmaengine.h"
@@ -49,29 +51,22 @@ static inline unsigned int dwc_get_sms(struct dw_dma_slave *slave)
        return slave ? slave->src_master : 1;
 }
 
-#define SRC_MASTER     0
-#define DST_MASTER     1
-
-static inline unsigned int dwc_get_master(struct dma_chan *chan, int master)
+static inline void dwc_set_masters(struct dw_dma_chan *dwc)
 {
-       struct dw_dma *dw = to_dw_dma(chan->device);
-       struct dw_dma_slave *dws = chan->private;
-       unsigned int m;
-
-       if (master == SRC_MASTER)
-               m = dwc_get_sms(dws);
-       else
-               m = dwc_get_dms(dws);
+       struct dw_dma *dw = to_dw_dma(dwc->chan.device);
+       struct dw_dma_slave *dws = dwc->chan.private;
+       unsigned char mmax = dw->nr_masters - 1;
 
-       return min_t(unsigned int, dw->nr_masters - 1, m);
+       if (dwc->request_line == ~0) {
+               dwc->src_master = min_t(unsigned char, mmax, dwc_get_sms(dws));
+               dwc->dst_master = min_t(unsigned char, mmax, dwc_get_dms(dws));
+       }
 }
 
 #define DWC_DEFAULT_CTLLO(_chan) ({                            \
                struct dw_dma_chan *_dwc = to_dw_dma_chan(_chan);       \
                struct dma_slave_config *_sconfig = &_dwc->dma_sconfig; \
                bool _is_slave = is_slave_direction(_dwc->direction);   \
-               int _dms = dwc_get_master(_chan, DST_MASTER);           \
-               int _sms = dwc_get_master(_chan, SRC_MASTER);           \
                u8 _smsize = _is_slave ? _sconfig->src_maxburst :       \
                        DW_DMA_MSIZE_16;                        \
                u8 _dmsize = _is_slave ? _sconfig->dst_maxburst :       \
@@ -81,8 +76,8 @@ static inline unsigned int dwc_get_master(struct dma_chan *chan, int master)
                 | DWC_CTLL_SRC_MSIZE(_smsize)                  \
                 | DWC_CTLL_LLP_D_EN                            \
                 | DWC_CTLL_LLP_S_EN                            \
-                | DWC_CTLL_DMS(_dms)                           \
-                | DWC_CTLL_SMS(_sms));                         \
+                | DWC_CTLL_DMS(_dwc->dst_master)               \
+                | DWC_CTLL_SMS(_dwc->src_master));             \
        })
 
 /*
@@ -92,13 +87,6 @@ static inline unsigned int dwc_get_master(struct dma_chan *chan, int master)
  */
 #define NR_DESCS_PER_CHANNEL   64
 
-static inline unsigned int dwc_get_data_width(struct dma_chan *chan, int master)
-{
-       struct dw_dma *dw = to_dw_dma(chan->device);
-
-       return dw->data_width[dwc_get_master(chan, master)];
-}
-
 /*----------------------------------------------------------------------*/
 
 static struct device *chan2dev(struct dma_chan *chan)
@@ -172,13 +160,7 @@ static void dwc_initialize(struct dw_dma_chan *dwc)
        if (dwc->initialized == true)
                return;
 
-       if (dws && dws->cfg_hi == ~0 && dws->cfg_lo == ~0) {
-               /* autoconfigure based on request line from DT */
-               if (dwc->direction == DMA_MEM_TO_DEV)
-                       cfghi = DWC_CFGH_DST_PER(dwc->request_line);
-               else if (dwc->direction == DMA_DEV_TO_MEM)
-                       cfghi = DWC_CFGH_SRC_PER(dwc->request_line);
-       } else if (dws) {
+       if (dws) {
                /*
                 * We need controller-specific data to set up slave
                 * transfers.
@@ -189,9 +171,9 @@ static void dwc_initialize(struct dw_dma_chan *dwc)
                cfglo |= dws->cfg_lo & ~DWC_CFGL_CH_PRIOR_MASK;
        } else {
                if (dwc->direction == DMA_MEM_TO_DEV)
-                       cfghi = DWC_CFGH_DST_PER(dwc->dma_sconfig.slave_id);
+                       cfghi = DWC_CFGH_DST_PER(dwc->request_line);
                else if (dwc->direction == DMA_DEV_TO_MEM)
-                       cfghi = DWC_CFGH_SRC_PER(dwc->dma_sconfig.slave_id);
+                       cfghi = DWC_CFGH_SRC_PER(dwc->request_line);
        }
 
        channel_writel(dwc, CFG_LO, cfglo);
@@ -473,16 +455,16 @@ static void dwc_scan_descriptors(struct dw_dma *dw, struct dw_dma_chan *dwc)
                        (unsigned long long)llp);
 
        list_for_each_entry_safe(desc, _desc, &dwc->active_list, desc_node) {
-               /* initial residue value */
+               /* Initial residue value */
                dwc->residue = desc->total_len;
 
-               /* check first descriptors addr */
+               /* Check first descriptors addr */
                if (desc->txd.phys == llp) {
                        spin_unlock_irqrestore(&dwc->lock, flags);
                        return;
                }
 
-               /* check first descriptors llp */
+               /* Check first descriptors llp */
                if (desc->lli.llp == llp) {
                        /* This one is currently in progress */
                        dwc->residue -= dwc_get_sent(dwc);
@@ -588,7 +570,7 @@ inline dma_addr_t dw_dma_get_dst_addr(struct dma_chan *chan)
 }
 EXPORT_SYMBOL(dw_dma_get_dst_addr);
 
-/* called with dwc->lock held and all DMAC interrupts disabled */
+/* Called with dwc->lock held and all DMAC interrupts disabled */
 static void dwc_handle_cyclic(struct dw_dma *dw, struct dw_dma_chan *dwc,
                u32 status_err, u32 status_xfer)
 {
@@ -626,7 +608,7 @@ static void dwc_handle_cyclic(struct dw_dma *dw, struct dw_dma_chan *dwc,
 
                dwc_chan_disable(dw, dwc);
 
-               /* make sure DMA does not restart by loading a new list */
+               /* Make sure DMA does not restart by loading a new list */
                channel_writel(dwc, LLP, 0);
                channel_writel(dwc, CTL_LO, 0);
                channel_writel(dwc, CTL_HI, 0);
@@ -745,6 +727,7 @@ dwc_prep_dma_memcpy(struct dma_chan *chan, dma_addr_t dest, dma_addr_t src,
                size_t len, unsigned long flags)
 {
        struct dw_dma_chan      *dwc = to_dw_dma_chan(chan);
+       struct dw_dma           *dw = to_dw_dma(chan->device);
        struct dw_desc          *desc;
        struct dw_desc          *first;
        struct dw_desc          *prev;
@@ -767,8 +750,8 @@ dwc_prep_dma_memcpy(struct dma_chan *chan, dma_addr_t dest, dma_addr_t src,
 
        dwc->direction = DMA_MEM_TO_MEM;
 
-       data_width = min_t(unsigned int, dwc_get_data_width(chan, SRC_MASTER),
-                          dwc_get_data_width(chan, DST_MASTER));
+       data_width = min_t(unsigned int, dw->data_width[dwc->src_master],
+                          dw->data_width[dwc->dst_master]);
 
        src_width = dst_width = min_t(unsigned int, data_width,
                                      dwc_fast_fls(src | dest | len));
@@ -826,6 +809,7 @@ dwc_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl,
                unsigned long flags, void *context)
 {
        struct dw_dma_chan      *dwc = to_dw_dma_chan(chan);
+       struct dw_dma           *dw = to_dw_dma(chan->device);
        struct dma_slave_config *sconfig = &dwc->dma_sconfig;
        struct dw_desc          *prev;
        struct dw_desc          *first;
@@ -859,7 +843,7 @@ dwc_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl,
                ctllo |= sconfig->device_fc ? DWC_CTLL_FC(DW_DMA_FC_P_M2P) :
                        DWC_CTLL_FC(DW_DMA_FC_D_M2P);
 
-               data_width = dwc_get_data_width(chan, SRC_MASTER);
+               data_width = dw->data_width[dwc->src_master];
 
                for_each_sg(sgl, sg, sg_len, i) {
                        struct dw_desc  *desc;
@@ -919,7 +903,7 @@ slave_sg_todev_fill_desc:
                ctllo |= sconfig->device_fc ? DWC_CTLL_FC(DW_DMA_FC_P_P2M) :
                        DWC_CTLL_FC(DW_DMA_FC_D_P2M);
 
-               data_width = dwc_get_data_width(chan, DST_MASTER);
+               data_width = dw->data_width[dwc->dst_master];
 
                for_each_sg(sgl, sg, sg_len, i) {
                        struct dw_desc  *desc;
@@ -1001,13 +985,6 @@ static inline void convert_burst(u32 *maxburst)
                *maxburst = 0;
 }
 
-static inline void convert_slave_id(struct dw_dma_chan *dwc)
-{
-       struct dw_dma *dw = to_dw_dma(dwc->chan.device);
-
-       dwc->dma_sconfig.slave_id -= dw->request_line_base;
-}
-
 static int
 set_runtime_config(struct dma_chan *chan, struct dma_slave_config *sconfig)
 {
@@ -1020,9 +997,12 @@ set_runtime_config(struct dma_chan *chan, struct dma_slave_config *sconfig)
        memcpy(&dwc->dma_sconfig, sconfig, sizeof(*sconfig));
        dwc->direction = sconfig->direction;
 
+       /* Take the request line from slave_id member */
+       if (dwc->request_line == ~0)
+               dwc->request_line = sconfig->slave_id;
+
        convert_burst(&dwc->dma_sconfig.src_maxburst);
        convert_burst(&dwc->dma_sconfig.dst_maxburst);
-       convert_slave_id(dwc);
 
        return 0;
 }
@@ -1030,10 +1010,11 @@ set_runtime_config(struct dma_chan *chan, struct dma_slave_config *sconfig)
 static inline void dwc_chan_pause(struct dw_dma_chan *dwc)
 {
        u32 cfglo = channel_readl(dwc, CFG_LO);
+       unsigned int count = 20;        /* timeout iterations */
 
        channel_writel(dwc, CFG_LO, cfglo | DWC_CFGL_CH_SUSP);
-       while (!(channel_readl(dwc, CFG_LO) & DWC_CFGL_FIFO_EMPTY))
-               cpu_relax();
+       while (!(channel_readl(dwc, CFG_LO) & DWC_CFGL_FIFO_EMPTY) && count--)
+               udelay(2);
 
        dwc->paused = true;
 }
@@ -1169,6 +1150,8 @@ static int dwc_alloc_chan_resources(struct dma_chan *chan)
         * doesn't mean what you think it means), and status writeback.
         */
 
+       dwc_set_masters(dwc);
+
        spin_lock_irqsave(&dwc->lock, flags);
        i = dwc->descs_allocated;
        while (dwc->descs_allocated < NR_DESCS_PER_CHANNEL) {
@@ -1226,6 +1209,7 @@ static void dwc_free_chan_resources(struct dma_chan *chan)
        list_splice_init(&dwc->free_list, &list);
        dwc->descs_allocated = 0;
        dwc->initialized = false;
+       dwc->request_line = ~0;
 
        /* Disable interrupts */
        channel_clear_bit(dw, MASK.XFER, dwc->mask);
@@ -1241,42 +1225,36 @@ static void dwc_free_chan_resources(struct dma_chan *chan)
        dev_vdbg(chan2dev(chan), "%s: done\n", __func__);
 }
 
-struct dw_dma_filter_args {
+/*----------------------------------------------------------------------*/
+
+struct dw_dma_of_filter_args {
        struct dw_dma *dw;
        unsigned int req;
        unsigned int src;
        unsigned int dst;
 };
 
-static bool dw_dma_generic_filter(struct dma_chan *chan, void *param)
+static bool dw_dma_of_filter(struct dma_chan *chan, void *param)
 {
        struct dw_dma_chan *dwc = to_dw_dma_chan(chan);
-       struct dw_dma *dw = to_dw_dma(chan->device);
-       struct dw_dma_filter_args *fargs = param;
-       struct dw_dma_slave *dws = &dwc->slave;
+       struct dw_dma_of_filter_args *fargs = param;
 
-       /* ensure the device matches our channel */
+       /* Ensure the device matches our channel */
         if (chan->device != &fargs->dw->dma)
                 return false;
 
-       dws->dma_dev    = dw->dma.dev;
-       dws->cfg_hi     = ~0;
-       dws->cfg_lo     = ~0;
-       dws->src_master = fargs->src;
-       dws->dst_master = fargs->dst;
-
        dwc->request_line = fargs->req;
-
-       chan->private = dws;
+       dwc->src_master = fargs->src;
+       dwc->dst_master = fargs->dst;
 
        return true;
 }
 
-static struct dma_chan *dw_dma_xlate(struct of_phandle_args *dma_spec,
-                                        struct of_dma *ofdma)
+static struct dma_chan *dw_dma_of_xlate(struct of_phandle_args *dma_spec,
+                                       struct of_dma *ofdma)
 {
        struct dw_dma *dw = ofdma->of_dma_data;
-       struct dw_dma_filter_args fargs = {
+       struct dw_dma_of_filter_args fargs = {
                .dw = dw,
        };
        dma_cap_mask_t cap;
@@ -1297,8 +1275,48 @@ static struct dma_chan *dw_dma_xlate(struct of_phandle_args *dma_spec,
        dma_cap_set(DMA_SLAVE, cap);
 
        /* TODO: there should be a simpler way to do this */
-       return dma_request_channel(cap, dw_dma_generic_filter, &fargs);
+       return dma_request_channel(cap, dw_dma_of_filter, &fargs);
+}
+
+#ifdef CONFIG_ACPI
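+/* Match a channel against the ACPI DMA spec and record its request line and default masters */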
+static bool dw_dma_acpi_filter(struct dma_chan *chan, void *param)
+{
+       struct dw_dma_chan *dwc = to_dw_dma_chan(chan);
+       struct acpi_dma_spec *dma_spec = param;
+
+       if (chan->device->dev != dma_spec->dev ||
+           chan->chan_id != dma_spec->chan_id)
+               return false;
+
+       dwc->request_line = dma_spec->slave_id;
+       dwc->src_master = dwc_get_sms(NULL);
+       dwc->dst_master = dwc_get_dms(NULL);
+
+       return true;
+}
+
+static void dw_dma_acpi_controller_register(struct dw_dma *dw)
+{
+       struct device *dev = dw->dma.dev;
+       struct acpi_dma_filter_info *info;
+       int ret;
+
+       info = devm_kzalloc(dev, sizeof(*info), GFP_KERNEL);
+       if (!info)
+               return;
+
+       dma_cap_zero(info->dma_cap);
+       dma_cap_set(DMA_SLAVE, info->dma_cap);
+       info->filter_fn = dw_dma_acpi_filter;
+
+       ret = devm_acpi_dma_controller_register(dev, acpi_dma_simple_xlate,
+                                               info);
+       if (ret)
+               dev_err(dev, "could not register acpi_dma_controller\n");
 }
+#else /* !CONFIG_ACPI */
+static inline void dw_dma_acpi_controller_register(struct dw_dma *dw) {}
+#endif /* !CONFIG_ACPI */
 
 /* --------------------- Cyclic DMA API extensions -------------------- */
 
@@ -1322,7 +1340,7 @@ int dw_dma_cyclic_start(struct dma_chan *chan)
 
        spin_lock_irqsave(&dwc->lock, flags);
 
-       /* assert channel is idle */
+       /* Assert channel is idle */
        if (dma_readl(dw, CH_EN) & dwc->mask) {
                dev_err(chan2dev(&dwc->chan),
                        "BUG: Attempted to start non-idle channel\n");
@@ -1334,7 +1352,7 @@ int dw_dma_cyclic_start(struct dma_chan *chan)
        dma_writel(dw, CLEAR.ERROR, dwc->mask);
        dma_writel(dw, CLEAR.XFER, dwc->mask);
 
-       /* setup DMAC channel registers */
+       /* Setup DMAC channel registers */
        channel_writel(dwc, LLP, dwc->cdesc->desc[0]->txd.phys);
        channel_writel(dwc, CTL_LO, DWC_CTLL_LLP_D_EN | DWC_CTLL_LLP_S_EN);
        channel_writel(dwc, CTL_HI, 0);
@@ -1501,7 +1519,7 @@ struct dw_cyclic_desc *dw_dma_cyclic_prep(struct dma_chan *chan,
                last = desc;
        }
 
-       /* lets make a cyclic list */
+       /* Let's make a cyclic list */
        last->lli.llp = cdesc->desc[0]->txd.phys;
 
        dev_dbg(chan2dev(&dwc->chan), "cyclic prepared buf 0x%llx len %zu "
@@ -1636,7 +1654,6 @@ dw_dma_parse_dt(struct platform_device *pdev)
 
 static int dw_probe(struct platform_device *pdev)
 {
-       const struct platform_device_id *match;
        struct dw_dma_platform_data *pdata;
        struct resource         *io;
        struct dw_dma           *dw;
@@ -1706,7 +1723,7 @@ static int dw_probe(struct platform_device *pdev)
 
        dw->regs = regs;
 
-       /* get hardware configuration parameters */
+       /* Get hardware configuration parameters */
        if (autocfg) {
                max_blk_size = dma_readl(dw, MAX_BLK_SIZE);
 
@@ -1720,18 +1737,13 @@ static int dw_probe(struct platform_device *pdev)
                memcpy(dw->data_width, pdata->data_width, 4);
        }
 
-       /* Get the base request line if set */
-       match = platform_get_device_id(pdev);
-       if (match)
-               dw->request_line_base = (unsigned int)match->driver_data;
-
        /* Calculate all channel mask before DMA setup */
        dw->all_chan_mask = (1 << nr_channels) - 1;
 
-       /* force dma off, just in case */
+       /* Force dma off, just in case */
        dw_dma_off(dw);
 
-       /* disable BLOCK interrupts as well */
+       /* Disable BLOCK interrupts as well */
        channel_clear_bit(dw, MASK.BLOCK, dw->all_chan_mask);
 
        err = devm_request_irq(&pdev->dev, irq, dw_dma_interrupt, 0,
@@ -1741,7 +1753,7 @@ static int dw_probe(struct platform_device *pdev)
 
        platform_set_drvdata(pdev, dw);
 
-       /* create a pool of consistent memory blocks for hardware descriptors */
+       /* Create a pool of consistent memory blocks for hardware descriptors */
        dw->desc_pool = dmam_pool_create("dw_dmac_desc_pool", &pdev->dev,
                                         sizeof(struct dw_desc), 4, 0);
        if (!dw->desc_pool) {
@@ -1781,8 +1793,9 @@ static int dw_probe(struct platform_device *pdev)
                channel_clear_bit(dw, CH_EN, dwc->mask);
 
                dwc->direction = DMA_TRANS_NONE;
+               dwc->request_line = ~0;
 
-               /* hardware configuration */
+               /* Hardware configuration */
                if (autocfg) {
                        unsigned int dwc_params;
 
@@ -1842,12 +1855,15 @@ static int dw_probe(struct platform_device *pdev)
 
        if (pdev->dev.of_node) {
                err = of_dma_controller_register(pdev->dev.of_node,
-                                                dw_dma_xlate, dw);
-               if (err && err != -ENODEV)
+                                                dw_dma_of_xlate, dw);
+               if (err)
                        dev_err(&pdev->dev,
                                "could not register of_dma_controller\n");
        }
 
+       if (ACPI_HANDLE(&pdev->dev))
+               dw_dma_acpi_controller_register(dw);
+
        return 0;
 }
 
@@ -1912,18 +1928,19 @@ static const struct dev_pm_ops dw_dev_pm_ops = {
 };
 
 #ifdef CONFIG_OF
-static const struct of_device_id dw_dma_id_table[] = {
+static const struct of_device_id dw_dma_of_id_table[] = {
        { .compatible = "snps,dma-spear1340" },
        {}
 };
-MODULE_DEVICE_TABLE(of, dw_dma_id_table);
+MODULE_DEVICE_TABLE(of, dw_dma_of_id_table);
 #endif
 
-static const struct platform_device_id dw_dma_ids[] = {
-       /* Name,        Request Line Base */
-       { "INTL9C60",   (kernel_ulong_t)16 },
+#ifdef CONFIG_ACPI
+static const struct acpi_device_id dw_dma_acpi_id_table[] = {
+       { "INTL9C60", 0 },
        { }
 };
+#endif
 
 static struct platform_driver dw_driver = {
        .probe          = dw_probe,
@@ -1932,9 +1949,9 @@ static struct platform_driver dw_driver = {
        .driver = {
                .name   = "dw_dmac",
                .pm     = &dw_dev_pm_ops,
-               .of_match_table = of_match_ptr(dw_dma_id_table),
+               .of_match_table = of_match_ptr(dw_dma_of_id_table),
+               .acpi_match_table = ACPI_PTR(dw_dma_acpi_id_table),
        },
-       .id_table       = dw_dma_ids,
 };
 
 static int __init dw_init(void)
index 4d02c3669b75bcf023a7f03916d02d3749ec2e1e..9d417200bd57f714b5493a589da7703252458658 100644 (file)
@@ -212,8 +212,11 @@ struct dw_dma_chan {
        /* hardware configuration */
        unsigned int            block_size;
        bool                    nollp;
+
+       /* custom slave configuration */
        unsigned int            request_line;
-       struct dw_dma_slave     slave;
+       unsigned char           src_master;
+       unsigned char           dst_master;
 
        /* configuration passed via DMA_SLAVE_CONFIG */
        struct dma_slave_config dma_sconfig;
@@ -247,7 +250,6 @@ struct dw_dma {
        /* hardware configuration */
        unsigned char           nr_masters;
        unsigned char           data_width[4];
-       unsigned int            request_line_base;
 
        struct dw_dma_chan      chan[0];
 };
index 70b8975d107ee185e9b4b3bcf6e220ff9b4d555e..f28583370d00f7d102b47b39241d1b5d63885a80 100644 (file)
@@ -859,8 +859,7 @@ static struct dma_async_tx_descriptor *imxdma_prep_dma_cyclic(
 
        desc = list_first_entry(&imxdmac->ld_free, struct imxdma_desc, node);
 
-       if (imxdmac->sg_list)
-               kfree(imxdmac->sg_list);
+       kfree(imxdmac->sg_list);
 
        imxdmac->sg_list = kcalloc(periods + 1,
                        sizeof(struct scatterlist), GFP_KERNEL);
@@ -1145,7 +1144,7 @@ err:
        return ret;
 }
 
-static int __exit imxdma_remove(struct platform_device *pdev)
+static int imxdma_remove(struct platform_device *pdev)
 {
        struct imxdma_engine *imxdma = platform_get_drvdata(pdev);
 
@@ -1162,7 +1161,7 @@ static struct platform_driver imxdma_driver = {
                .name   = "imx-dma",
        },
        .id_table       = imx_dma_devtype,
-       .remove         = __exit_p(imxdma_remove),
+       .remove         = imxdma_remove,
 };
 
 static int __init imxdma_module_init(void)
index f082aa3a918c9afa82fa459849c96d63f81242b6..092867bf795c0d939b57ef3a9e4622c833f463a1 100644 (file)
@@ -1462,7 +1462,7 @@ err_irq:
        return ret;
 }
 
-static int __exit sdma_remove(struct platform_device *pdev)
+static int sdma_remove(struct platform_device *pdev)
 {
        return -EBUSY;
 }
@@ -1473,7 +1473,7 @@ static struct platform_driver sdma_driver = {
                .of_match_table = sdma_dt_ids,
        },
        .id_table       = sdma_devtypes,
-       .remove         = __exit_p(sdma_remove),
+       .remove         = sdma_remove,
 };
 
 static int __init sdma_module_init(void)
index 1879a5942bfc73ffb74e498cdcb7229ee0a52aaf..17a2393b3e25048fb495b9049941fe1be8f4cf36 100644 (file)
@@ -892,7 +892,7 @@ MODULE_PARM_DESC(ioat_interrupt_style,
  * ioat_dma_setup_interrupts - setup interrupt handler
  * @device: ioat device
  */
-static int ioat_dma_setup_interrupts(struct ioatdma_device *device)
+int ioat_dma_setup_interrupts(struct ioatdma_device *device)
 {
        struct ioat_chan_common *chan;
        struct pci_dev *pdev = device->pdev;
@@ -941,6 +941,7 @@ msix:
                }
        }
        intrctrl |= IOAT_INTRCTRL_MSIX_VECTOR_CONTROL;
+       device->irq_mode = IOAT_MSIX;
        goto done;
 
 msix_single_vector:
@@ -956,6 +957,7 @@ msix_single_vector:
                pci_disable_msix(pdev);
                goto msi;
        }
+       device->irq_mode = IOAT_MSIX_SINGLE;
        goto done;
 
 msi:
@@ -969,6 +971,7 @@ msi:
                pci_disable_msi(pdev);
                goto intx;
        }
+       device->irq_mode = IOAT_MSIX;
        goto done;
 
 intx:
@@ -977,6 +980,7 @@ intx:
        if (err)
                goto err_no_irq;
 
+       device->irq_mode = IOAT_INTX;
 done:
        if (device->intr_quirk)
                device->intr_quirk(device);
@@ -987,9 +991,11 @@ done:
 err_no_irq:
        /* Disable all interrupt generation */
        writeb(0, device->reg_base + IOAT_INTRCTRL_OFFSET);
+       device->irq_mode = IOAT_NOIRQ;
        dev_err(dev, "no usable interrupts\n");
        return err;
 }
+EXPORT_SYMBOL(ioat_dma_setup_interrupts);
 
 static void ioat_disable_interrupts(struct ioatdma_device *device)
 {
index 53a4cbb78f4790f24be88100f6661805942671f5..54fb7b9ff9aaa4afb88c823b3a129a22440a9320 100644 (file)
@@ -39,6 +39,7 @@
 #define to_ioat_desc(lh) container_of(lh, struct ioat_desc_sw, node)
 #define tx_to_ioat_desc(tx) container_of(tx, struct ioat_desc_sw, txd)
 #define to_dev(ioat_chan) (&(ioat_chan)->device->pdev->dev)
+#define to_pdev(ioat_chan) ((ioat_chan)->device->pdev)
 
 #define chan_num(ch) ((int)((ch)->reg_base - (ch)->device->reg_base) / 0x80)
 
  */
 #define NULL_DESC_BUFFER_SIZE 1
 
+enum ioat_irq_mode {
+       IOAT_NOIRQ = 0,
+       IOAT_MSIX,
+       IOAT_MSIX_SINGLE,
+       IOAT_MSI,
+       IOAT_INTX
+};
+
 /**
  * struct ioatdma_device - internal representation of a IOAT device
  * @pdev: PCI-Express device
@@ -72,11 +81,16 @@ struct ioatdma_device {
        void __iomem *reg_base;
        struct pci_pool *dma_pool;
        struct pci_pool *completion_pool;
+#define MAX_SED_POOLS  5
+       struct dma_pool *sed_hw_pool[MAX_SED_POOLS];
+       struct kmem_cache *sed_pool;
        struct dma_device common;
        u8 version;
        struct msix_entry msix_entries[4];
        struct ioat_chan_common *idx[4];
        struct dca_provider *dca;
+       enum ioat_irq_mode irq_mode;
+       u32 cap;
        void (*intr_quirk)(struct ioatdma_device *device);
        int (*enumerate_channels)(struct ioatdma_device *device);
        int (*reset_hw)(struct ioat_chan_common *chan);
@@ -131,6 +145,20 @@ struct ioat_dma_chan {
        u16 active;
 };
 
+/**
+ * struct ioat_sed_ent - wrapper around super extended hardware descriptor
+ * @hw: hardware SED
+ * @dma: dma address for the SED
+ * @parent: pointer to the dma descriptor that owns this SED
+ * @hw_pool: index of the hardware SED pool the entry came from
+ */
+struct ioat_sed_ent {
+       struct ioat_sed_raw_descriptor *hw;
+       dma_addr_t dma;
+       struct ioat_ring_ent *parent;
+       unsigned int hw_pool;
+};
+
 static inline struct ioat_chan_common *to_chan_common(struct dma_chan *c)
 {
        return container_of(c, struct ioat_chan_common, common);
@@ -179,7 +207,7 @@ __dump_desc_dbg(struct ioat_chan_common *chan, struct ioat_dma_descriptor *hw,
        struct device *dev = to_dev(chan);
 
        dev_dbg(dev, "desc[%d]: (%#llx->%#llx) cookie: %d flags: %#x"
-               " ctl: %#x (op: %d int_en: %d compl: %d)\n", id,
+               " ctl: %#10.8x (op: %#x int_en: %d compl: %d)\n", id,
                (unsigned long long) tx->phys,
                (unsigned long long) hw->next, tx->cookie, tx->flags,
                hw->ctl, hw->ctl_f.op, hw->ctl_f.int_en, hw->ctl_f.compl_write);
@@ -201,7 +229,7 @@ ioat_chan_by_index(struct ioatdma_device *device, int index)
        return device->idx[index];
 }
 
-static inline u64 ioat_chansts(struct ioat_chan_common *chan)
+static inline u64 ioat_chansts_32(struct ioat_chan_common *chan)
 {
        u8 ver = chan->device->version;
        u64 status;
@@ -218,6 +246,26 @@ static inline u64 ioat_chansts(struct ioat_chan_common *chan)
        return status;
 }
 
+#if BITS_PER_LONG == 64
+
+static inline u64 ioat_chansts(struct ioat_chan_common *chan)
+{
+       u8 ver = chan->device->version;
+       u64 status;
+
+       /* With IOAT v3.3 the status register is 64 bit. */
+       if (ver >= IOAT_VER_3_3)
+               status = readq(chan->reg_base + IOAT_CHANSTS_OFFSET(ver));
+       else
+               status = ioat_chansts_32(chan);
+
+       return status;
+}
+
+#else
+#define ioat_chansts ioat_chansts_32
+#endif
+
 static inline void ioat_start(struct ioat_chan_common *chan)
 {
        u8 ver = chan->device->version;
@@ -321,6 +369,7 @@ bool ioat_cleanup_preamble(struct ioat_chan_common *chan,
                           dma_addr_t *phys_complete);
 void ioat_kobject_add(struct ioatdma_device *device, struct kobj_type *type);
 void ioat_kobject_del(struct ioatdma_device *device);
+int ioat_dma_setup_interrupts(struct ioatdma_device *device);
 extern const struct sysfs_ops ioat_sysfs_ops;
 extern struct ioat_sysfs_entry ioat_version_attr;
 extern struct ioat_sysfs_entry ioat_cap_attr;
index e100f644e3446449d09cc09dbc639d096d75d305..29bf9448035d321305e9c8791265e6c744c5a677 100644 (file)
@@ -137,6 +137,7 @@ struct ioat_ring_ent {
        #ifdef DEBUG
        int id;
        #endif
+       struct ioat_sed_ent *sed;
 };
 
 static inline struct ioat_ring_ent *
@@ -157,6 +158,7 @@ static inline void ioat2_set_chainaddr(struct ioat2_dma_chan *ioat, u64 addr)
 
 int ioat2_dma_probe(struct ioatdma_device *dev, int dca);
 int ioat3_dma_probe(struct ioatdma_device *dev, int dca);
+void ioat3_dma_remove(struct ioatdma_device *dev);
 struct dca_provider *ioat2_dca_init(struct pci_dev *pdev, void __iomem *iobase);
 struct dca_provider *ioat3_dca_init(struct pci_dev *pdev, void __iomem *iobase);
 int ioat2_check_space_lock(struct ioat2_dma_chan *ioat, int num_descs);
index e8336cce360b8f03a504890dc5ee426a2179f3ff..ca6ea9b3551b3f0307b1440a55dcd92a64e2f466 100644 (file)
@@ -55,7 +55,7 @@
 /*
  * Support routines for v3+ hardware
  */
-
+#include <linux/module.h>
 #include <linux/pci.h>
 #include <linux/gfp.h>
 #include <linux/dmaengine.h>
 /* ioat hardware assumes at least two sources for raid operations */
 #define src_cnt_to_sw(x) ((x) + 2)
 #define src_cnt_to_hw(x) ((x) - 2)
+#define ndest_to_sw(x) ((x) + 1)
+#define ndest_to_hw(x) ((x) - 1)
+#define src16_cnt_to_sw(x) ((x) + 9)
+#define src16_cnt_to_hw(x) ((x) - 9)
 
 /* provide a lookup table for setting the source address in the base or
  * extended descriptor of an xor or pq descriptor
 static const u8 xor_idx_to_desc = 0xe0;
 static const u8 xor_idx_to_field[] = { 1, 4, 5, 6, 7, 0, 1, 2 };
 static const u8 pq_idx_to_desc = 0xf8;
+static const u8 pq16_idx_to_desc[] = { 0, 0, 1, 1, 1, 1, 1, 1, 1,
+                                      2, 2, 2, 2, 2, 2, 2 };
 static const u8 pq_idx_to_field[] = { 1, 4, 5, 0, 1, 2, 4, 5 };
+static const u8 pq16_idx_to_field[] = { 1, 4, 1, 2, 3, 4, 5, 6, 7,
+                                       0, 1, 2, 3, 4, 5, 6 };
+
+/*
+ * technically sources 1 and 2 do not require SED, but the op will have
+ * at least 9 sources, so that's irrelevant.
+ */
+static const u8 pq16_idx_to_sed[] = { 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                                     1, 1, 1, 1, 1, 1, 1 };
+
+static void ioat3_eh(struct ioat2_dma_chan *ioat);
 
 static dma_addr_t xor_get_src(struct ioat_raw_descriptor *descs[2], int idx)
 {
@@ -101,6 +118,13 @@ static dma_addr_t pq_get_src(struct ioat_raw_descriptor *descs[2], int idx)
        return raw->field[pq_idx_to_field[idx]];
 }
 
+static dma_addr_t pq16_get_src(struct ioat_raw_descriptor *desc[3], int idx)
+{
+       struct ioat_raw_descriptor *raw = desc[pq16_idx_to_desc[idx]];
+
+       return raw->field[pq16_idx_to_field[idx]];
+}
+
 static void pq_set_src(struct ioat_raw_descriptor *descs[2],
                       dma_addr_t addr, u32 offset, u8 coef, int idx)
 {
@@ -111,6 +135,167 @@ static void pq_set_src(struct ioat_raw_descriptor *descs[2],
        pq->coef[idx] = coef;
 }
 
+static int sed_get_pq16_pool_idx(int src_cnt)
+{
+
+       return pq16_idx_to_sed[src_cnt];
+}
+
+static bool is_jf_ioat(struct pci_dev *pdev)
+{
+       switch (pdev->device) {
+       case PCI_DEVICE_ID_INTEL_IOAT_JSF0:
+       case PCI_DEVICE_ID_INTEL_IOAT_JSF1:
+       case PCI_DEVICE_ID_INTEL_IOAT_JSF2:
+       case PCI_DEVICE_ID_INTEL_IOAT_JSF3:
+       case PCI_DEVICE_ID_INTEL_IOAT_JSF4:
+       case PCI_DEVICE_ID_INTEL_IOAT_JSF5:
+       case PCI_DEVICE_ID_INTEL_IOAT_JSF6:
+       case PCI_DEVICE_ID_INTEL_IOAT_JSF7:
+       case PCI_DEVICE_ID_INTEL_IOAT_JSF8:
+       case PCI_DEVICE_ID_INTEL_IOAT_JSF9:
+               return true;
+       default:
+               return false;
+       }
+}
+
+static bool is_snb_ioat(struct pci_dev *pdev)
+{
+       switch (pdev->device) {
+       case PCI_DEVICE_ID_INTEL_IOAT_SNB0:
+       case PCI_DEVICE_ID_INTEL_IOAT_SNB1:
+       case PCI_DEVICE_ID_INTEL_IOAT_SNB2:
+       case PCI_DEVICE_ID_INTEL_IOAT_SNB3:
+       case PCI_DEVICE_ID_INTEL_IOAT_SNB4:
+       case PCI_DEVICE_ID_INTEL_IOAT_SNB5:
+       case PCI_DEVICE_ID_INTEL_IOAT_SNB6:
+       case PCI_DEVICE_ID_INTEL_IOAT_SNB7:
+       case PCI_DEVICE_ID_INTEL_IOAT_SNB8:
+       case PCI_DEVICE_ID_INTEL_IOAT_SNB9:
+               return true;
+       default:
+               return false;
+       }
+}
+
+static bool is_ivb_ioat(struct pci_dev *pdev)
+{
+       switch (pdev->device) {
+       case PCI_DEVICE_ID_INTEL_IOAT_IVB0:
+       case PCI_DEVICE_ID_INTEL_IOAT_IVB1:
+       case PCI_DEVICE_ID_INTEL_IOAT_IVB2:
+       case PCI_DEVICE_ID_INTEL_IOAT_IVB3:
+       case PCI_DEVICE_ID_INTEL_IOAT_IVB4:
+       case PCI_DEVICE_ID_INTEL_IOAT_IVB5:
+       case PCI_DEVICE_ID_INTEL_IOAT_IVB6:
+       case PCI_DEVICE_ID_INTEL_IOAT_IVB7:
+       case PCI_DEVICE_ID_INTEL_IOAT_IVB8:
+       case PCI_DEVICE_ID_INTEL_IOAT_IVB9:
+               return true;
+       default:
+               return false;
+       }
+
+}
+
+static bool is_hsw_ioat(struct pci_dev *pdev)
+{
+       switch (pdev->device) {
+       case PCI_DEVICE_ID_INTEL_IOAT_HSW0:
+       case PCI_DEVICE_ID_INTEL_IOAT_HSW1:
+       case PCI_DEVICE_ID_INTEL_IOAT_HSW2:
+       case PCI_DEVICE_ID_INTEL_IOAT_HSW3:
+       case PCI_DEVICE_ID_INTEL_IOAT_HSW4:
+       case PCI_DEVICE_ID_INTEL_IOAT_HSW5:
+       case PCI_DEVICE_ID_INTEL_IOAT_HSW6:
+       case PCI_DEVICE_ID_INTEL_IOAT_HSW7:
+       case PCI_DEVICE_ID_INTEL_IOAT_HSW8:
+       case PCI_DEVICE_ID_INTEL_IOAT_HSW9:
+               return true;
+       default:
+               return false;
+       }
+
+}
+
+static bool is_xeon_cb32(struct pci_dev *pdev)
+{
+       return is_jf_ioat(pdev) || is_snb_ioat(pdev) || is_ivb_ioat(pdev) ||
+               is_hsw_ioat(pdev);
+}
+
+static bool is_bwd_ioat(struct pci_dev *pdev)
+{
+       switch (pdev->device) {
+       case PCI_DEVICE_ID_INTEL_IOAT_BWD0:
+       case PCI_DEVICE_ID_INTEL_IOAT_BWD1:
+       case PCI_DEVICE_ID_INTEL_IOAT_BWD2:
+       case PCI_DEVICE_ID_INTEL_IOAT_BWD3:
+               return true;
+       default:
+               return false;
+       }
+}
+
+static bool is_bwd_noraid(struct pci_dev *pdev)
+{
+       switch (pdev->device) {
+       case PCI_DEVICE_ID_INTEL_IOAT_BWD2:
+       case PCI_DEVICE_ID_INTEL_IOAT_BWD3:
+               return true;
+       default:
+               return false;
+       }
+
+}
+
+static void pq16_set_src(struct ioat_raw_descriptor *desc[3],
+                       dma_addr_t addr, u32 offset, u8 coef, int idx)
+{
+       struct ioat_pq_descriptor *pq = (struct ioat_pq_descriptor *)desc[0];
+       struct ioat_pq16a_descriptor *pq16 =
+               (struct ioat_pq16a_descriptor *)desc[1];
+       struct ioat_raw_descriptor *raw = desc[pq16_idx_to_desc[idx]];
+
+       raw->field[pq16_idx_to_field[idx]] = addr + offset;
+
+       if (idx < 8)
+               pq->coef[idx] = coef;
+       else
+               pq16->coef[idx - 8] = coef;
+}
+
+static struct ioat_sed_ent *
+ioat3_alloc_sed(struct ioatdma_device *device, unsigned int hw_pool)
+{
+       struct ioat_sed_ent *sed;
+       gfp_t flags = __GFP_ZERO | GFP_ATOMIC;
+
+       sed = kmem_cache_alloc(device->sed_pool, flags);
+       if (!sed)
+               return NULL;
+
+       sed->hw_pool = hw_pool;
+       sed->hw = dma_pool_alloc(device->sed_hw_pool[hw_pool],
+                                flags, &sed->dma);
+       if (!sed->hw) {
+               kmem_cache_free(device->sed_pool, sed);
+               return NULL;
+       }
+
+       return sed;
+}
+
+static void ioat3_free_sed(struct ioatdma_device *device, struct ioat_sed_ent *sed)
+{
+       if (!sed)
+               return;
+
+       dma_pool_free(device->sed_hw_pool[sed->hw_pool], sed->hw, sed->dma);
+       kmem_cache_free(device->sed_pool, sed);
+}
+
 static void ioat3_dma_unmap(struct ioat2_dma_chan *ioat,
                            struct ioat_ring_ent *desc, int idx)
 {
@@ -223,6 +408,54 @@ static void ioat3_dma_unmap(struct ioat2_dma_chan *ioat,
                }
                break;
        }
+       case IOAT_OP_PQ_16S:
+       case IOAT_OP_PQ_VAL_16S: {
+               struct ioat_pq_descriptor *pq = desc->pq;
+               int src_cnt = src16_cnt_to_sw(pq->ctl_f.src_cnt);
+               struct ioat_raw_descriptor *descs[4];
+               int i;
+
+               /* in the 'continue' case don't unmap the dests as sources */
+               if (dmaf_p_disabled_continue(flags))
+                       src_cnt--;
+               else if (dmaf_continue(flags))
+                       src_cnt -= 3;
+
+               if (!(flags & DMA_COMPL_SKIP_SRC_UNMAP)) {
+                       descs[0] = (struct ioat_raw_descriptor *)pq;
+                       descs[1] = (struct ioat_raw_descriptor *)(desc->sed->hw);
+                       descs[2] = (struct ioat_raw_descriptor *)(&desc->sed->hw->b[0]);
+                       for (i = 0; i < src_cnt; i++) {
+                               dma_addr_t src = pq16_get_src(descs, i);
+
+                               ioat_unmap(pdev, src - offset, len,
+                                          PCI_DMA_TODEVICE, flags, 0);
+                       }
+
+                       /* the dests are sources in pq validate operations */
+                       if (pq->ctl_f.op == IOAT_OP_XOR_VAL) {
+                               if (!(flags & DMA_PREP_PQ_DISABLE_P))
+                                       ioat_unmap(pdev, pq->p_addr - offset,
+                                                  len, PCI_DMA_TODEVICE,
+                                                  flags, 0);
+                               if (!(flags & DMA_PREP_PQ_DISABLE_Q))
+                                       ioat_unmap(pdev, pq->q_addr - offset,
+                                                  len, PCI_DMA_TODEVICE,
+                                                  flags, 0);
+                               break;
+                       }
+               }
+
+               if (!(flags & DMA_COMPL_SKIP_DEST_UNMAP)) {
+                       if (!(flags & DMA_PREP_PQ_DISABLE_P))
+                               ioat_unmap(pdev, pq->p_addr - offset, len,
+                                          PCI_DMA_BIDIRECTIONAL, flags, 1);
+                       if (!(flags & DMA_PREP_PQ_DISABLE_Q))
+                               ioat_unmap(pdev, pq->q_addr - offset, len,
+                                          PCI_DMA_BIDIRECTIONAL, flags, 1);
+               }
+               break;
+       }
        default:
                dev_err(&pdev->dev, "%s: unknown op type: %#x\n",
                        __func__, desc->hw->ctl_f.op);
@@ -250,6 +483,63 @@ static bool desc_has_ext(struct ioat_ring_ent *desc)
        return false;
 }
 
+static u64 ioat3_get_current_completion(struct ioat_chan_common *chan)
+{
+       u64 phys_complete;
+       u64 completion;
+
+       completion = *chan->completion;
+       phys_complete = ioat_chansts_to_addr(completion);
+
+       dev_dbg(to_dev(chan), "%s: phys_complete: %#llx\n", __func__,
+               (unsigned long long) phys_complete);
+
+       return phys_complete;
+}
+
+static bool ioat3_cleanup_preamble(struct ioat_chan_common *chan,
+                                  u64 *phys_complete)
+{
+       *phys_complete = ioat3_get_current_completion(chan);
+       if (*phys_complete == chan->last_completion)
+               return false;
+
+       clear_bit(IOAT_COMPLETION_ACK, &chan->state);
+       mod_timer(&chan->timer, jiffies + COMPLETION_TIMEOUT);
+
+       return true;
+}
+
+static void
+desc_get_errstat(struct ioat2_dma_chan *ioat, struct ioat_ring_ent *desc)
+{
+       struct ioat_dma_descriptor *hw = desc->hw;
+
+       switch (hw->ctl_f.op) {
+       case IOAT_OP_PQ_VAL:
+       case IOAT_OP_PQ_VAL_16S:
+       {
+               struct ioat_pq_descriptor *pq = desc->pq;
+
+               /* check if there's error written */
+               if (!pq->dwbes_f.wbes)
+                       return;
+
+               /* need to set a chanerr var for checking to clear later */
+
+               if (pq->dwbes_f.p_val_err)
+                       *desc->result |= SUM_CHECK_P_RESULT;
+
+               if (pq->dwbes_f.q_val_err)
+                       *desc->result |= SUM_CHECK_Q_RESULT;
+
+               return;
+       }
+       default:
+               return;
+       }
+}
+
 /**
  * __cleanup - reclaim used descriptors
  * @ioat: channel (ring) to clean
@@ -260,6 +550,7 @@ static bool desc_has_ext(struct ioat_ring_ent *desc)
 static void __cleanup(struct ioat2_dma_chan *ioat, dma_addr_t phys_complete)
 {
        struct ioat_chan_common *chan = &ioat->base;
+       struct ioatdma_device *device = chan->device;
        struct ioat_ring_ent *desc;
        bool seen_current = false;
        int idx = ioat->tail, i;
@@ -268,6 +559,16 @@ static void __cleanup(struct ioat2_dma_chan *ioat, dma_addr_t phys_complete)
        dev_dbg(to_dev(chan), "%s: head: %#x tail: %#x issued: %#x\n",
                __func__, ioat->head, ioat->tail, ioat->issued);
 
+       /*
+        * At restart of the channel, the completion address and the
+        * channel status will be 0 due to starting a new chain. Since
+        * it's new chain and the first descriptor "fails", there is
+        * it's a new chain and the first descriptor "fails", there is
+        * chain due to this 0 address value and then BUG.
+        */
+       if (!phys_complete)
+               return;
+
        active = ioat2_ring_active(ioat);
        for (i = 0; i < active && !seen_current; i++) {
                struct dma_async_tx_descriptor *tx;
@@ -276,6 +577,11 @@ static void __cleanup(struct ioat2_dma_chan *ioat, dma_addr_t phys_complete)
                prefetch(ioat2_get_ring_ent(ioat, idx + i + 1));
                desc = ioat2_get_ring_ent(ioat, idx + i);
                dump_desc_dbg(ioat, desc);
+
+               /* set err stat if we are using dwbes */
+               if (device->cap & IOAT_CAP_DWBES)
+                       desc_get_errstat(ioat, desc);
+
                tx = &desc->txd;
                if (tx->cookie) {
                        dma_cookie_complete(tx);
@@ -294,6 +600,12 @@ static void __cleanup(struct ioat2_dma_chan *ioat, dma_addr_t phys_complete)
                        BUG_ON(i + 1 >= active);
                        i++;
                }
+
+               /* cleanup super extended descriptors */
+               if (desc->sed) {
+                       ioat3_free_sed(device, desc->sed);
+                       desc->sed = NULL;
+               }
        }
        smp_mb(); /* finish all descriptor reads before incrementing tail */
        ioat->tail = idx + i;
@@ -314,11 +626,22 @@ static void __cleanup(struct ioat2_dma_chan *ioat, dma_addr_t phys_complete)
 static void ioat3_cleanup(struct ioat2_dma_chan *ioat)
 {
        struct ioat_chan_common *chan = &ioat->base;
-       dma_addr_t phys_complete;
+       u64 phys_complete;
 
        spin_lock_bh(&chan->cleanup_lock);
-       if (ioat_cleanup_preamble(chan, &phys_complete))
+
+       if (ioat3_cleanup_preamble(chan, &phys_complete))
                __cleanup(ioat, phys_complete);
+
+       if (is_ioat_halted(*chan->completion)) {
+               u32 chanerr = readl(chan->reg_base + IOAT_CHANERR_OFFSET);
+
+               if (chanerr & IOAT_CHANERR_HANDLE_MASK) {
+                       mod_timer(&chan->timer, jiffies + IDLE_TIMEOUT);
+                       ioat3_eh(ioat);
+               }
+       }
+
        spin_unlock_bh(&chan->cleanup_lock);
 }
 
@@ -333,15 +656,78 @@ static void ioat3_cleanup_event(unsigned long data)
 static void ioat3_restart_channel(struct ioat2_dma_chan *ioat)
 {
        struct ioat_chan_common *chan = &ioat->base;
-       dma_addr_t phys_complete;
+       u64 phys_complete;
 
        ioat2_quiesce(chan, 0);
-       if (ioat_cleanup_preamble(chan, &phys_complete))
+       if (ioat3_cleanup_preamble(chan, &phys_complete))
                __cleanup(ioat, phys_complete);
 
        __ioat2_restart_chan(ioat);
 }
 
+static void ioat3_eh(struct ioat2_dma_chan *ioat)
+{
+       struct ioat_chan_common *chan = &ioat->base;
+       struct pci_dev *pdev = to_pdev(chan);
+       struct ioat_dma_descriptor *hw;
+       u64 phys_complete;
+       struct ioat_ring_ent *desc;
+       u32 err_handled = 0;
+       u32 chanerr_int;
+       u32 chanerr;
+
+       /* cleanup so tail points to descriptor that caused the error */
+       if (ioat3_cleanup_preamble(chan, &phys_complete))
+               __cleanup(ioat, phys_complete);
+
+       chanerr = readl(chan->reg_base + IOAT_CHANERR_OFFSET);
+       pci_read_config_dword(pdev, IOAT_PCI_CHANERR_INT_OFFSET, &chanerr_int);
+
+       dev_dbg(to_dev(chan), "%s: error = %x:%x\n",
+               __func__, chanerr, chanerr_int);
+
+       desc = ioat2_get_ring_ent(ioat, ioat->tail);
+       hw = desc->hw;
+       dump_desc_dbg(ioat, desc);
+
+       switch (hw->ctl_f.op) {
+       case IOAT_OP_XOR_VAL:
+               if (chanerr & IOAT_CHANERR_XOR_P_OR_CRC_ERR) {
+                       *desc->result |= SUM_CHECK_P_RESULT;
+                       err_handled |= IOAT_CHANERR_XOR_P_OR_CRC_ERR;
+               }
+               break;
+       case IOAT_OP_PQ_VAL:
+       case IOAT_OP_PQ_VAL_16S:
+               if (chanerr & IOAT_CHANERR_XOR_P_OR_CRC_ERR) {
+                       *desc->result |= SUM_CHECK_P_RESULT;
+                       err_handled |= IOAT_CHANERR_XOR_P_OR_CRC_ERR;
+               }
+               if (chanerr & IOAT_CHANERR_XOR_Q_ERR) {
+                       *desc->result |= SUM_CHECK_Q_RESULT;
+                       err_handled |= IOAT_CHANERR_XOR_Q_ERR;
+               }
+               break;
+       }
+
+       /* fault on unhandled error or spurious halt */
+       if (chanerr ^ err_handled || chanerr == 0) {
+               dev_err(to_dev(chan), "%s: fatal error (%x:%x)\n",
+                       __func__, chanerr, err_handled);
+               BUG();
+       }
+
+       writel(chanerr, chan->reg_base + IOAT_CHANERR_OFFSET);
+       pci_write_config_dword(pdev, IOAT_PCI_CHANERR_INT_OFFSET, chanerr_int);
+
+       /* mark faulting descriptor as complete */
+       *chan->completion = desc->txd.phys;
+
+       spin_lock_bh(&ioat->prep_lock);
+       ioat3_restart_channel(ioat);
+       spin_unlock_bh(&ioat->prep_lock);
+}
+
 static void check_active(struct ioat2_dma_chan *ioat)
 {
        struct ioat_chan_common *chan = &ioat->base;
@@ -605,7 +991,8 @@ dump_pq_desc_dbg(struct ioat2_dma_chan *ioat, struct ioat_ring_ent *desc, struct
        int i;
 
        dev_dbg(dev, "desc[%d]: (%#llx->%#llx) flags: %#x"
-               " sz: %#x ctl: %#x (op: %d int: %d compl: %d pq: '%s%s' src_cnt: %d)\n",
+               " sz: %#10.8x ctl: %#x (op: %#x int: %d compl: %d pq: '%s%s'"
+               " src_cnt: %d)\n",
                desc_id(desc), (unsigned long long) desc->txd.phys,
                (unsigned long long) (pq_ex ? pq_ex->next : pq->next),
                desc->txd.flags, pq->size, pq->ctl, pq->ctl_f.op, pq->ctl_f.int_en,
@@ -617,6 +1004,42 @@ dump_pq_desc_dbg(struct ioat2_dma_chan *ioat, struct ioat_ring_ent *desc, struct
                        (unsigned long long) pq_get_src(descs, i), pq->coef[i]);
        dev_dbg(dev, "\tP: %#llx\n", pq->p_addr);
        dev_dbg(dev, "\tQ: %#llx\n", pq->q_addr);
+       dev_dbg(dev, "\tNEXT: %#llx\n", pq->next);
+}
+
+static void dump_pq16_desc_dbg(struct ioat2_dma_chan *ioat,
+                              struct ioat_ring_ent *desc)
+{
+       struct device *dev = to_dev(&ioat->base);
+       struct ioat_pq_descriptor *pq = desc->pq;
+       struct ioat_raw_descriptor *descs[] = { (void *)pq,
+                                               (void *)pq,
+                                               (void *)pq };
+       int src_cnt = src16_cnt_to_sw(pq->ctl_f.src_cnt);
+       int i;
+
+       if (desc->sed) {
+               descs[1] = (void *)desc->sed->hw;
+               descs[2] = (void *)desc->sed->hw + 64;
+       }
+
+       dev_dbg(dev, "desc[%d]: (%#llx->%#llx) flags: %#x"
+               " sz: %#x ctl: %#x (op: %#x int: %d compl: %d pq: '%s%s'"
+               " src_cnt: %d)\n",
+               desc_id(desc), (unsigned long long) desc->txd.phys,
+               (unsigned long long) pq->next,
+               desc->txd.flags, pq->size, pq->ctl,
+               pq->ctl_f.op, pq->ctl_f.int_en,
+               pq->ctl_f.compl_write,
+               pq->ctl_f.p_disable ? "" : "p", pq->ctl_f.q_disable ? "" : "q",
+               pq->ctl_f.src_cnt);
+       for (i = 0; i < src_cnt; i++) {
+               dev_dbg(dev, "\tsrc[%d]: %#llx coef: %#x\n", i,
+                       (unsigned long long) pq16_get_src(descs, i),
+                       pq->coef[i]);
+       }
+       dev_dbg(dev, "\tP: %#llx\n", pq->p_addr);
+       dev_dbg(dev, "\tQ: %#llx\n", pq->q_addr);
 }
 
 static struct dma_async_tx_descriptor *
@@ -627,6 +1050,7 @@ __ioat3_prep_pq_lock(struct dma_chan *c, enum sum_check_flags *result,
 {
        struct ioat2_dma_chan *ioat = to_ioat2_chan(c);
        struct ioat_chan_common *chan = &ioat->base;
+       struct ioatdma_device *device = chan->device;
        struct ioat_ring_ent *compl_desc;
        struct ioat_ring_ent *desc;
        struct ioat_ring_ent *ext;
@@ -637,6 +1061,7 @@ __ioat3_prep_pq_lock(struct dma_chan *c, enum sum_check_flags *result,
        u32 offset = 0;
        u8 op = result ? IOAT_OP_PQ_VAL : IOAT_OP_PQ;
        int i, s, idx, with_ext, num_descs;
+       int cb32 = (device->version < IOAT_VER_3_3) ? 1 : 0;
 
        dev_dbg(to_dev(chan), "%s\n", __func__);
        /* the engine requires at least two sources (we provide
@@ -662,7 +1087,7 @@ __ioat3_prep_pq_lock(struct dma_chan *c, enum sum_check_flags *result,
         * order.
         */
        if (likely(num_descs) &&
-           ioat2_check_space_lock(ioat, num_descs+1) == 0)
+           ioat2_check_space_lock(ioat, num_descs + cb32) == 0)
                idx = ioat->head;
        else
                return NULL;
@@ -700,6 +1125,9 @@ __ioat3_prep_pq_lock(struct dma_chan *c, enum sum_check_flags *result,
                pq->q_addr = dst[1] + offset;
                pq->ctl = 0;
                pq->ctl_f.op = op;
+               /* we turn on descriptor write back error status */
+               if (device->cap & IOAT_CAP_DWBES)
+                       pq->ctl_f.wb_en = result ? 1 : 0;
                pq->ctl_f.src_cnt = src_cnt_to_hw(s);
                pq->ctl_f.p_disable = !!(flags & DMA_PREP_PQ_DISABLE_P);
                pq->ctl_f.q_disable = !!(flags & DMA_PREP_PQ_DISABLE_Q);
@@ -716,26 +1144,140 @@ __ioat3_prep_pq_lock(struct dma_chan *c, enum sum_check_flags *result,
        pq->ctl_f.fence = !!(flags & DMA_PREP_FENCE);
        dump_pq_desc_dbg(ioat, desc, ext);
 
-       /* completion descriptor carries interrupt bit */
-       compl_desc = ioat2_get_ring_ent(ioat, idx + i);
-       compl_desc->txd.flags = flags & DMA_PREP_INTERRUPT;
-       hw = compl_desc->hw;
-       hw->ctl = 0;
-       hw->ctl_f.null = 1;
-       hw->ctl_f.int_en = !!(flags & DMA_PREP_INTERRUPT);
-       hw->ctl_f.compl_write = 1;
-       hw->size = NULL_DESC_BUFFER_SIZE;
-       dump_desc_dbg(ioat, compl_desc);
+       if (!cb32) {
+               pq->ctl_f.int_en = !!(flags & DMA_PREP_INTERRUPT);
+               pq->ctl_f.compl_write = 1;
+               compl_desc = desc;
+       } else {
+               /* completion descriptor carries interrupt bit */
+               compl_desc = ioat2_get_ring_ent(ioat, idx + i);
+               compl_desc->txd.flags = flags & DMA_PREP_INTERRUPT;
+               hw = compl_desc->hw;
+               hw->ctl = 0;
+               hw->ctl_f.null = 1;
+               hw->ctl_f.int_en = !!(flags & DMA_PREP_INTERRUPT);
+               hw->ctl_f.compl_write = 1;
+               hw->size = NULL_DESC_BUFFER_SIZE;
+               dump_desc_dbg(ioat, compl_desc);
+       }
 
        /* we leave the channel locked to ensure in order submission */
        return &compl_desc->txd;
 }
 
+static struct dma_async_tx_descriptor *
+__ioat3_prep_pq16_lock(struct dma_chan *c, enum sum_check_flags *result,
+                      const dma_addr_t *dst, const dma_addr_t *src,
+                      unsigned int src_cnt, const unsigned char *scf,
+                      size_t len, unsigned long flags)
+{
+       struct ioat2_dma_chan *ioat = to_ioat2_chan(c);
+       struct ioat_chan_common *chan = &ioat->base;
+       struct ioatdma_device *device = chan->device;
+       struct ioat_ring_ent *desc;
+       size_t total_len = len;
+       struct ioat_pq_descriptor *pq;
+       u32 offset = 0;
+       u8 op;
+       int i, s, idx, num_descs;
+
+       /* this function only handles src_cnt 9 - 16 */
+       BUG_ON(src_cnt < 9);
+
+       op = result ? IOAT_OP_PQ_VAL_16S : IOAT_OP_PQ_16S;
+
+       dev_dbg(to_dev(chan), "%s\n", __func__);
+
+       num_descs = ioat2_xferlen_to_descs(ioat, len);
+
+       /*
+        * 16 source pq is only available on cb3.3 and has no completion
+        * write hw bug.
+        */
+       if (num_descs && ioat2_check_space_lock(ioat, num_descs) == 0)
+               idx = ioat->head;
+       else
+               return NULL;
+
+       i = 0;
+
+       do {
+               struct ioat_raw_descriptor *descs[4];
+               size_t xfer_size = min_t(size_t, len, 1 << ioat->xfercap_log);
+
+               desc = ioat2_get_ring_ent(ioat, idx + i);
+               pq = desc->pq;
+
+               descs[0] = (struct ioat_raw_descriptor *) pq;
+
+               desc->sed = ioat3_alloc_sed(device,
+                                           sed_get_pq16_pool_idx(src_cnt));
+               if (!desc->sed) {
+                       dev_err(to_dev(chan),
+                               "%s: no free sed entries\n", __func__);
+                       return NULL;
+               }
+
+               pq->sed_addr = desc->sed->dma;
+               desc->sed->parent = desc;
+
+               descs[1] = (struct ioat_raw_descriptor *)desc->sed->hw;
+               descs[2] = (void *)descs[1] + 64;
+
+               for (s = 0; s < src_cnt; s++)
+                       pq16_set_src(descs, src[s], offset, scf[s], s);
+
+               /* see the comment for dma_maxpq in include/linux/dmaengine.h */
+               if (dmaf_p_disabled_continue(flags))
+                       pq16_set_src(descs, dst[1], offset, 1, s++);
+               else if (dmaf_continue(flags)) {
+                       pq16_set_src(descs, dst[0], offset, 0, s++);
+                       pq16_set_src(descs, dst[1], offset, 1, s++);
+                       pq16_set_src(descs, dst[1], offset, 0, s++);
+               }
+
+               pq->size = xfer_size;
+               pq->p_addr = dst[0] + offset;
+               pq->q_addr = dst[1] + offset;
+               pq->ctl = 0;
+               pq->ctl_f.op = op;
+               pq->ctl_f.src_cnt = src16_cnt_to_hw(s);
+               /* we turn on descriptor write back error status */
+               if (device->cap & IOAT_CAP_DWBES)
+                       pq->ctl_f.wb_en = result ? 1 : 0;
+               pq->ctl_f.p_disable = !!(flags & DMA_PREP_PQ_DISABLE_P);
+               pq->ctl_f.q_disable = !!(flags & DMA_PREP_PQ_DISABLE_Q);
+
+               len -= xfer_size;
+               offset += xfer_size;
+       } while (++i < num_descs);
+
+       /* last pq descriptor carries the unmap parameters and fence bit */
+       desc->txd.flags = flags;
+       desc->len = total_len;
+       if (result)
+               desc->result = result;
+       pq->ctl_f.fence = !!(flags & DMA_PREP_FENCE);
+
+       /* with cb3.3 we should be able to do completion w/o a null desc */
+       pq->ctl_f.int_en = !!(flags & DMA_PREP_INTERRUPT);
+       pq->ctl_f.compl_write = 1;
+
+       dump_pq16_desc_dbg(ioat, desc);
+
+       /* we leave the channel locked to ensure in order submission */
+       return &desc->txd;
+}
+
 static struct dma_async_tx_descriptor *
 ioat3_prep_pq(struct dma_chan *chan, dma_addr_t *dst, dma_addr_t *src,
              unsigned int src_cnt, const unsigned char *scf, size_t len,
              unsigned long flags)
 {
+       struct dma_device *dma = chan->device;
+
        /* specify valid address for disabled result */
        if (flags & DMA_PREP_PQ_DISABLE_P)
                dst[0] = dst[1];
@@ -755,11 +1297,20 @@ ioat3_prep_pq(struct dma_chan *chan, dma_addr_t *dst, dma_addr_t *src,
                single_source_coef[0] = scf[0];
                single_source_coef[1] = 0;
 
-               return __ioat3_prep_pq_lock(chan, NULL, dst, single_source, 2,
-                                           single_source_coef, len, flags);
-       } else
-               return __ioat3_prep_pq_lock(chan, NULL, dst, src, src_cnt, scf,
-                                           len, flags);
+               return (src_cnt > 8) && (dma->max_pq > 8) ?
+                       __ioat3_prep_pq16_lock(chan, NULL, dst, single_source,
+                                              2, single_source_coef, len,
+                                              flags) :
+                       __ioat3_prep_pq_lock(chan, NULL, dst, single_source, 2,
+                                            single_source_coef, len, flags);
+
+       } else {
+               return (src_cnt > 8) && (dma->max_pq > 8) ?
+                       __ioat3_prep_pq16_lock(chan, NULL, dst, src, src_cnt,
+                                              scf, len, flags) :
+                       __ioat3_prep_pq_lock(chan, NULL, dst, src, src_cnt,
+                                            scf, len, flags);
+       }
 }
 
 struct dma_async_tx_descriptor *
@@ -767,6 +1318,8 @@ ioat3_prep_pq_val(struct dma_chan *chan, dma_addr_t *pq, dma_addr_t *src,
                  unsigned int src_cnt, const unsigned char *scf, size_t len,
                  enum sum_check_flags *pqres, unsigned long flags)
 {
+       struct dma_device *dma = chan->device;
+
        /* specify valid address for disabled result */
        if (flags & DMA_PREP_PQ_DISABLE_P)
                pq[0] = pq[1];
@@ -778,14 +1331,18 @@ ioat3_prep_pq_val(struct dma_chan *chan, dma_addr_t *pq, dma_addr_t *src,
         */
        *pqres = 0;
 
-       return __ioat3_prep_pq_lock(chan, pqres, pq, src, src_cnt, scf, len,
-                                   flags);
+       return (src_cnt > 8) && (dma->max_pq > 8) ?
+               __ioat3_prep_pq16_lock(chan, pqres, pq, src, src_cnt, scf, len,
+                                      flags) :
+               __ioat3_prep_pq_lock(chan, pqres, pq, src, src_cnt, scf, len,
+                                    flags);
 }
 
 static struct dma_async_tx_descriptor *
 ioat3_prep_pqxor(struct dma_chan *chan, dma_addr_t dst, dma_addr_t *src,
                 unsigned int src_cnt, size_t len, unsigned long flags)
 {
+       struct dma_device *dma = chan->device;
        unsigned char scf[src_cnt];
        dma_addr_t pq[2];
 
@@ -794,8 +1351,11 @@ ioat3_prep_pqxor(struct dma_chan *chan, dma_addr_t dst, dma_addr_t *src,
        flags |= DMA_PREP_PQ_DISABLE_Q;
        pq[1] = dst; /* specify valid address for disabled result */
 
-       return __ioat3_prep_pq_lock(chan, NULL, pq, src, src_cnt, scf, len,
-                                   flags);
+       return (src_cnt > 8) && (dma->max_pq > 8) ?
+               __ioat3_prep_pq16_lock(chan, NULL, pq, src, src_cnt, scf, len,
+                                      flags) :
+               __ioat3_prep_pq_lock(chan, NULL, pq, src, src_cnt, scf, len,
+                                    flags);
 }
 
 struct dma_async_tx_descriptor *
@@ -803,6 +1363,7 @@ ioat3_prep_pqxor_val(struct dma_chan *chan, dma_addr_t *src,
                     unsigned int src_cnt, size_t len,
                     enum sum_check_flags *result, unsigned long flags)
 {
+       struct dma_device *dma = chan->device;
        unsigned char scf[src_cnt];
        dma_addr_t pq[2];
 
@@ -816,8 +1377,12 @@ ioat3_prep_pqxor_val(struct dma_chan *chan, dma_addr_t *src,
        flags |= DMA_PREP_PQ_DISABLE_Q;
        pq[1] = pq[0]; /* specify valid address for disabled result */
 
-       return __ioat3_prep_pq_lock(chan, result, pq, &src[1], src_cnt - 1, scf,
-                                   len, flags);
+       return (src_cnt > 8) && (dma->max_pq > 8) ?
+               __ioat3_prep_pq16_lock(chan, result, pq, &src[1], src_cnt - 1,
+                                      scf, len, flags) :
+               __ioat3_prep_pq_lock(chan, result, pq, &src[1], src_cnt - 1,
+                                    scf, len, flags);
 }
 
 static struct dma_async_tx_descriptor *
@@ -1167,6 +1732,56 @@ static int ioat3_dma_self_test(struct ioatdma_device *device)
        return 0;
 }
 
+static int ioat3_irq_reinit(struct ioatdma_device *device)
+{
+       int msixcnt = device->common.chancnt;
+       struct pci_dev *pdev = device->pdev;
+       int i;
+       struct msix_entry *msix;
+       struct ioat_chan_common *chan;
+       int err = 0;
+
+       switch (device->irq_mode) {
+       case IOAT_MSIX:
+
+               for (i = 0; i < msixcnt; i++) {
+                       msix = &device->msix_entries[i];
+                       chan = ioat_chan_by_index(device, i);
+                       devm_free_irq(&pdev->dev, msix->vector, chan);
+               }
+
+               pci_disable_msix(pdev);
+               break;
+
+       case IOAT_MSIX_SINGLE:
+               msix = &device->msix_entries[0];
+               chan = ioat_chan_by_index(device, 0);
+               devm_free_irq(&pdev->dev, msix->vector, chan);
+               pci_disable_msix(pdev);
+               break;
+
+       case IOAT_MSI:
+               chan = ioat_chan_by_index(device, 0);
+               devm_free_irq(&pdev->dev, pdev->irq, chan);
+               pci_disable_msi(pdev);
+               break;
+
+       case IOAT_INTX:
+               chan = ioat_chan_by_index(device, 0);
+               devm_free_irq(&pdev->dev, pdev->irq, chan);
+               break;
+
+       default:
+               return 0;
+       }
+
+       device->irq_mode = IOAT_NOIRQ;
+
+       err = ioat_dma_setup_interrupts(device);
+
+       return err;
+}
+
 static int ioat3_reset_hw(struct ioat_chan_common *chan)
 {
        /* throw away whatever the channel was doing and get it
@@ -1183,80 +1798,65 @@ static int ioat3_reset_hw(struct ioat_chan_common *chan)
        chanerr = readl(chan->reg_base + IOAT_CHANERR_OFFSET);
        writel(chanerr, chan->reg_base + IOAT_CHANERR_OFFSET);
 
-       /* clear any pending errors */
-       err = pci_read_config_dword(pdev, IOAT_PCI_CHANERR_INT_OFFSET, &chanerr);
+       if (device->version < IOAT_VER_3_3) {
+               /* clear any pending errors */
+               err = pci_read_config_dword(pdev,
+                               IOAT_PCI_CHANERR_INT_OFFSET, &chanerr);
+               if (err) {
+                       dev_err(&pdev->dev,
+                               "channel error register unreachable\n");
+                       return err;
+               }
+               pci_write_config_dword(pdev,
+                               IOAT_PCI_CHANERR_INT_OFFSET, chanerr);
+
+               /* Clear DMAUNCERRSTS Cfg-Reg Parity Error status bit
+                * (workaround for spurious config parity error after restart)
+                */
+               pci_read_config_word(pdev, IOAT_PCI_DEVICE_ID_OFFSET, &dev_id);
+               if (dev_id == PCI_DEVICE_ID_INTEL_IOAT_TBG0) {
+                       pci_write_config_dword(pdev,
+                                              IOAT_PCI_DMAUNCERRSTS_OFFSET,
+                                              0x10);
+               }
+       }
+
+       err = ioat2_reset_sync(chan, msecs_to_jiffies(200));
        if (err) {
-               dev_err(&pdev->dev, "channel error register unreachable\n");
+               dev_err(&pdev->dev, "Failed to reset!\n");
                return err;
        }
-       pci_write_config_dword(pdev, IOAT_PCI_CHANERR_INT_OFFSET, chanerr);
 
-       /* Clear DMAUNCERRSTS Cfg-Reg Parity Error status bit
-        * (workaround for spurious config parity error after restart)
-        */
-       pci_read_config_word(pdev, IOAT_PCI_DEVICE_ID_OFFSET, &dev_id);
-       if (dev_id == PCI_DEVICE_ID_INTEL_IOAT_TBG0)
-               pci_write_config_dword(pdev, IOAT_PCI_DMAUNCERRSTS_OFFSET, 0x10);
+       if (device->irq_mode != IOAT_NOIRQ && is_bwd_ioat(pdev))
+               err = ioat3_irq_reinit(device);
 
-       return ioat2_reset_sync(chan, msecs_to_jiffies(200));
+       return err;
 }
 
-static bool is_jf_ioat(struct pci_dev *pdev)
+static void ioat3_intr_quirk(struct ioatdma_device *device)
 {
-       switch (pdev->device) {
-       case PCI_DEVICE_ID_INTEL_IOAT_JSF0:
-       case PCI_DEVICE_ID_INTEL_IOAT_JSF1:
-       case PCI_DEVICE_ID_INTEL_IOAT_JSF2:
-       case PCI_DEVICE_ID_INTEL_IOAT_JSF3:
-       case PCI_DEVICE_ID_INTEL_IOAT_JSF4:
-       case PCI_DEVICE_ID_INTEL_IOAT_JSF5:
-       case PCI_DEVICE_ID_INTEL_IOAT_JSF6:
-       case PCI_DEVICE_ID_INTEL_IOAT_JSF7:
-       case PCI_DEVICE_ID_INTEL_IOAT_JSF8:
-       case PCI_DEVICE_ID_INTEL_IOAT_JSF9:
-               return true;
-       default:
-               return false;
-       }
-}
+       struct dma_device *dma;
+       struct dma_chan *c;
+       struct ioat_chan_common *chan;
+       u32 errmask;
 
-static bool is_snb_ioat(struct pci_dev *pdev)
-{
-       switch (pdev->device) {
-       case PCI_DEVICE_ID_INTEL_IOAT_SNB0:
-       case PCI_DEVICE_ID_INTEL_IOAT_SNB1:
-       case PCI_DEVICE_ID_INTEL_IOAT_SNB2:
-       case PCI_DEVICE_ID_INTEL_IOAT_SNB3:
-       case PCI_DEVICE_ID_INTEL_IOAT_SNB4:
-       case PCI_DEVICE_ID_INTEL_IOAT_SNB5:
-       case PCI_DEVICE_ID_INTEL_IOAT_SNB6:
-       case PCI_DEVICE_ID_INTEL_IOAT_SNB7:
-       case PCI_DEVICE_ID_INTEL_IOAT_SNB8:
-       case PCI_DEVICE_ID_INTEL_IOAT_SNB9:
-               return true;
-       default:
-               return false;
-       }
-}
+       dma = &device->common;
 
-static bool is_ivb_ioat(struct pci_dev *pdev)
-{
-       switch (pdev->device) {
-       case PCI_DEVICE_ID_INTEL_IOAT_IVB0:
-       case PCI_DEVICE_ID_INTEL_IOAT_IVB1:
-       case PCI_DEVICE_ID_INTEL_IOAT_IVB2:
-       case PCI_DEVICE_ID_INTEL_IOAT_IVB3:
-       case PCI_DEVICE_ID_INTEL_IOAT_IVB4:
-       case PCI_DEVICE_ID_INTEL_IOAT_IVB5:
-       case PCI_DEVICE_ID_INTEL_IOAT_IVB6:
-       case PCI_DEVICE_ID_INTEL_IOAT_IVB7:
-       case PCI_DEVICE_ID_INTEL_IOAT_IVB8:
-       case PCI_DEVICE_ID_INTEL_IOAT_IVB9:
-               return true;
-       default:
-               return false;
+       /*
+        * if we have descriptor write back error status, we mask the
+        * error interrupts
+        */
+       if (device->cap & IOAT_CAP_DWBES) {
+               list_for_each_entry(c, &dma->channels, device_node) {
+                       chan = to_chan_common(c);
+                       errmask = readl(chan->reg_base +
+                                       IOAT_CHANERR_MASK_OFFSET);
+                       errmask |= IOAT_CHANERR_XOR_P_OR_CRC_ERR |
+                                  IOAT_CHANERR_XOR_Q_ERR;
+                       writel(errmask, chan->reg_base +
+                                       IOAT_CHANERR_MASK_OFFSET);
+               }
        }
-
 }
 
 int ioat3_dma_probe(struct ioatdma_device *device, int dca)
@@ -1268,30 +1868,33 @@ int ioat3_dma_probe(struct ioatdma_device *device, int dca)
        struct ioat_chan_common *chan;
        bool is_raid_device = false;
        int err;
-       u32 cap;
 
        device->enumerate_channels = ioat2_enumerate_channels;
        device->reset_hw = ioat3_reset_hw;
        device->self_test = ioat3_dma_self_test;
+       device->intr_quirk = ioat3_intr_quirk;
        dma = &device->common;
        dma->device_prep_dma_memcpy = ioat2_dma_prep_memcpy_lock;
        dma->device_issue_pending = ioat2_issue_pending;
        dma->device_alloc_chan_resources = ioat2_alloc_chan_resources;
        dma->device_free_chan_resources = ioat2_free_chan_resources;
 
-       if (is_jf_ioat(pdev) || is_snb_ioat(pdev) || is_ivb_ioat(pdev))
+       if (is_xeon_cb32(pdev))
                dma->copy_align = 6;
 
        dma_cap_set(DMA_INTERRUPT, dma->cap_mask);
        dma->device_prep_dma_interrupt = ioat3_prep_interrupt_lock;
 
-       cap = readl(device->reg_base + IOAT_DMA_CAP_OFFSET);
+       device->cap = readl(device->reg_base + IOAT_DMA_CAP_OFFSET);
+
+       if (is_bwd_noraid(pdev))
+               device->cap &= ~(IOAT_CAP_XOR | IOAT_CAP_PQ | IOAT_CAP_RAID16SS);
 
        /* dca is incompatible with raid operations */
-       if (dca_en && (cap & (IOAT_CAP_XOR|IOAT_CAP_PQ)))
-               cap &= ~(IOAT_CAP_XOR|IOAT_CAP_PQ);
+       if (dca_en && (device->cap & (IOAT_CAP_XOR|IOAT_CAP_PQ)))
+               device->cap &= ~(IOAT_CAP_XOR|IOAT_CAP_PQ);
 
-       if (cap & IOAT_CAP_XOR) {
+       if (device->cap & IOAT_CAP_XOR) {
                is_raid_device = true;
                dma->max_xor = 8;
                dma->xor_align = 6;
@@ -1302,53 +1905,86 @@ int ioat3_dma_probe(struct ioatdma_device *device, int dca)
                dma_cap_set(DMA_XOR_VAL, dma->cap_mask);
                dma->device_prep_dma_xor_val = ioat3_prep_xor_val;
        }
-       if (cap & IOAT_CAP_PQ) {
+
+       if (device->cap & IOAT_CAP_PQ) {
                is_raid_device = true;
-               dma_set_maxpq(dma, 8, 0);
-               dma->pq_align = 6;
 
-               dma_cap_set(DMA_PQ, dma->cap_mask);
                dma->device_prep_dma_pq = ioat3_prep_pq;
-
-               dma_cap_set(DMA_PQ_VAL, dma->cap_mask);
                dma->device_prep_dma_pq_val = ioat3_prep_pq_val;
+               dma_cap_set(DMA_PQ, dma->cap_mask);
+               dma_cap_set(DMA_PQ_VAL, dma->cap_mask);
 
-               if (!(cap & IOAT_CAP_XOR)) {
-                       dma->max_xor = 8;
-                       dma->xor_align = 6;
+               if (device->cap & IOAT_CAP_RAID16SS) {
+                       dma_set_maxpq(dma, 16, 0);
+                       dma->pq_align = 0;
+               } else {
+                       dma_set_maxpq(dma, 8, 0);
+                       if (is_xeon_cb32(pdev))
+                               dma->pq_align = 6;
+                       else
+                               dma->pq_align = 0;
+               }
 
-                       dma_cap_set(DMA_XOR, dma->cap_mask);
+               if (!(device->cap & IOAT_CAP_XOR)) {
                        dma->device_prep_dma_xor = ioat3_prep_pqxor;
-
-                       dma_cap_set(DMA_XOR_VAL, dma->cap_mask);
                        dma->device_prep_dma_xor_val = ioat3_prep_pqxor_val;
+                       dma_cap_set(DMA_XOR, dma->cap_mask);
+                       dma_cap_set(DMA_XOR_VAL, dma->cap_mask);
+
+                       if (device->cap & IOAT_CAP_RAID16SS) {
+                               dma->max_xor = 16;
+                               dma->xor_align = 0;
+                       } else {
+                               dma->max_xor = 8;
+                               if (is_xeon_cb32(pdev))
+                                       dma->xor_align = 6;
+                               else
+                                       dma->xor_align = 0;
+                       }
                }
        }
-       if (is_raid_device && (cap & IOAT_CAP_FILL_BLOCK)) {
+
+       if (is_raid_device && (device->cap & IOAT_CAP_FILL_BLOCK)) {
                dma_cap_set(DMA_MEMSET, dma->cap_mask);
                dma->device_prep_dma_memset = ioat3_prep_memset_lock;
        }
 
 
-       if (is_raid_device) {
-               dma->device_tx_status = ioat3_tx_status;
-               device->cleanup_fn = ioat3_cleanup_event;
-               device->timer_fn = ioat3_timer_event;
-       } else {
-               dma->device_tx_status = ioat_dma_tx_status;
-               device->cleanup_fn = ioat2_cleanup_event;
-               device->timer_fn = ioat2_timer_event;
+       dma->device_tx_status = ioat3_tx_status;
+       device->cleanup_fn = ioat3_cleanup_event;
+       device->timer_fn = ioat3_timer_event;
+
+       if (is_xeon_cb32(pdev)) {
+               dma_cap_clear(DMA_XOR_VAL, dma->cap_mask);
+               dma->device_prep_dma_xor_val = NULL;
+
+               dma_cap_clear(DMA_PQ_VAL, dma->cap_mask);
+               dma->device_prep_dma_pq_val = NULL;
        }
 
-       #ifdef CONFIG_ASYNC_TX_DISABLE_PQ_VAL_DMA
-       dma_cap_clear(DMA_PQ_VAL, dma->cap_mask);
-       dma->device_prep_dma_pq_val = NULL;
-       #endif
+       /* starting with CB3.3 super extended descriptors are supported */
+       if (device->cap & IOAT_CAP_RAID16SS) {
+               char pool_name[14];
+               int i;
+
+               /* allocate sw descriptor pool for SED */
+               device->sed_pool = kmem_cache_create("ioat_sed",
+                               sizeof(struct ioat_sed_ent), 0, 0, NULL);
+               if (!device->sed_pool)
+                       return -ENOMEM;
+
+               for (i = 0; i < MAX_SED_POOLS; i++) {
+                       snprintf(pool_name, 14, "ioat_hw%d_sed", i);
 
-       #ifdef CONFIG_ASYNC_TX_DISABLE_XOR_VAL_DMA
-       dma_cap_clear(DMA_XOR_VAL, dma->cap_mask);
-       dma->device_prep_dma_xor_val = NULL;
-       #endif
+                       /* allocate SED DMA pool */
+                       device->sed_hw_pool[i] = dma_pool_create(pool_name,
+                                       &pdev->dev,
+                                       SED_SIZE * (i + 1), 64, 0);
+                       if (!device->sed_hw_pool[i])
+                               goto sed_pool_cleanup;
+
+               }
+       }
 
        err = ioat_probe(device);
        if (err)
@@ -1371,4 +2007,28 @@ int ioat3_dma_probe(struct ioatdma_device *device, int dca)
                device->dca = ioat3_dca_init(pdev, device->reg_base);
 
        return 0;
+
+sed_pool_cleanup:
+       if (device->sed_pool) {
+               int i;
+               kmem_cache_destroy(device->sed_pool);
+
+               for (i = 0; i < MAX_SED_POOLS; i++)
+                       if (device->sed_hw_pool[i])
+                               dma_pool_destroy(device->sed_hw_pool[i]);
+       }
+
+       return -ENOMEM;
+}
+
+void ioat3_dma_remove(struct ioatdma_device *device)
+{
+       if (device->sed_pool) {
+               int i;
+               kmem_cache_destroy(device->sed_pool);
+
+               for (i = 0; i < MAX_SED_POOLS; i++)
+                       if (device->sed_hw_pool[i])
+                               dma_pool_destroy(device->sed_hw_pool[i]);
+       }
 }
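The RAID16SS support above parks the extra source addresses of a 16-source PQ operation in a "super extended descriptor" (SED) taken from one of MAX_SED_POOLS dma_pools: pool i hands out blocks of SED_SIZE * (i + 1) bytes and pq16_idx_to_sed[] picks the pool from the source count. A trivial standalone sketch of that sizing, for illustration only:

#include <stdio.h>

#define SED_SIZE      64	/* bytes per hardware SED slice */
#define MAX_SED_POOLS 5

int main(void)
{
	int i;

	/* pool i serves SED blocks of SED_SIZE * (i + 1) bytes */
	for (i = 0; i < MAX_SED_POOLS; i++)
		printf("sed_hw_pool[%d] -> %d-byte blocks\n",
		       i, SED_SIZE * (i + 1));
	return 0;
}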
index 7cb74c62c7192f5bcae7d5fe6999eb17cb21dfd3..5ee57d402a6ef0361ef9317a3daeb225d95f75f7 100644 (file)
 #define IOAT_PCI_DID_SCNB       0x65FF
 #define IOAT_PCI_DID_SNB        0x402F
 
-#define IOAT_VER_1_2            0x12    /* Version 1.2 */
-#define IOAT_VER_2_0            0x20    /* Version 2.0 */
-#define IOAT_VER_3_0            0x30    /* Version 3.0 */
-#define IOAT_VER_3_2            0x32    /* Version 3.2 */
-
 #define PCI_DEVICE_ID_INTEL_IOAT_IVB0  0x0e20
 #define PCI_DEVICE_ID_INTEL_IOAT_IVB1  0x0e21
 #define PCI_DEVICE_ID_INTEL_IOAT_IVB2  0x0e22
 #define PCI_DEVICE_ID_INTEL_IOAT_IVB8  0x0e2e
 #define PCI_DEVICE_ID_INTEL_IOAT_IVB9  0x0e2f
 
+#define PCI_DEVICE_ID_INTEL_IOAT_HSW0  0x2f20
+#define PCI_DEVICE_ID_INTEL_IOAT_HSW1  0x2f21
+#define PCI_DEVICE_ID_INTEL_IOAT_HSW2  0x2f22
+#define PCI_DEVICE_ID_INTEL_IOAT_HSW3  0x2f23
+#define PCI_DEVICE_ID_INTEL_IOAT_HSW4  0x2f24
+#define PCI_DEVICE_ID_INTEL_IOAT_HSW5  0x2f25
+#define PCI_DEVICE_ID_INTEL_IOAT_HSW6  0x2f26
+#define PCI_DEVICE_ID_INTEL_IOAT_HSW7  0x2f27
+#define PCI_DEVICE_ID_INTEL_IOAT_HSW8  0x2f2e
+#define PCI_DEVICE_ID_INTEL_IOAT_HSW9  0x2f2f
+
+#define PCI_DEVICE_ID_INTEL_IOAT_BWD0  0x0C50
+#define PCI_DEVICE_ID_INTEL_IOAT_BWD1  0x0C51
+#define PCI_DEVICE_ID_INTEL_IOAT_BWD2  0x0C52
+#define PCI_DEVICE_ID_INTEL_IOAT_BWD3  0x0C53
+
+#define IOAT_VER_1_2            0x12    /* Version 1.2 */
+#define IOAT_VER_2_0            0x20    /* Version 2.0 */
+#define IOAT_VER_3_0            0x30    /* Version 3.0 */
+#define IOAT_VER_3_2            0x32    /* Version 3.2 */
+#define IOAT_VER_3_3            0x33    /* Version 3.3 */
+
 int system_has_dca_enabled(struct pci_dev *pdev);
 
 struct ioat_dma_descriptor {
@@ -147,7 +165,17 @@ struct ioat_xor_ext_descriptor {
 };
 
 struct ioat_pq_descriptor {
-       uint32_t        size;
+       union {
+               uint32_t        size;
+               uint32_t        dwbes;
+               struct {
+                       unsigned int rsvd:25;
+                       unsigned int p_val_err:1;
+                       unsigned int q_val_err:1;
+                       unsigned int rsvd1:4;
+                       unsigned int wbes:1;
+               } dwbes_f;
+       };
        union {
                uint32_t ctl;
                struct {
@@ -162,9 +190,14 @@ struct ioat_pq_descriptor {
                        unsigned int hint:1;
                        unsigned int p_disable:1;
                        unsigned int q_disable:1;
-                       unsigned int rsvd:11;
+                       unsigned int rsvd2:2;
+                       unsigned int wb_en:1;
+                       unsigned int prl_en:1;
+                       unsigned int rsvd3:7;
                        #define IOAT_OP_PQ 0x89
                        #define IOAT_OP_PQ_VAL 0x8a
+                       #define IOAT_OP_PQ_16S 0xa0
+                       #define IOAT_OP_PQ_VAL_16S 0xa1
                        unsigned int op:8;
                } ctl_f;
        };
@@ -172,7 +205,10 @@ struct ioat_pq_descriptor {
        uint64_t        p_addr;
        uint64_t        next;
        uint64_t        src_addr2;
-       uint64_t        src_addr3;
+       union {
+               uint64_t        src_addr3;
+               uint64_t        sed_addr;
+       };
        uint8_t         coef[8];
        uint64_t        q_addr;
 };
@@ -221,4 +257,40 @@ struct ioat_pq_update_descriptor {
 struct ioat_raw_descriptor {
        uint64_t        field[8];
 };
+
+struct ioat_pq16a_descriptor {
+       uint8_t coef[8];
+       uint64_t src_addr3;
+       uint64_t src_addr4;
+       uint64_t src_addr5;
+       uint64_t src_addr6;
+       uint64_t src_addr7;
+       uint64_t src_addr8;
+       uint64_t src_addr9;
+};
+
+struct ioat_pq16b_descriptor {
+       uint64_t src_addr10;
+       uint64_t src_addr11;
+       uint64_t src_addr12;
+       uint64_t src_addr13;
+       uint64_t src_addr14;
+       uint64_t src_addr15;
+       uint64_t src_addr16;
+       uint64_t rsvd;
+};
+
+union ioat_sed_pq_descriptor {
+       struct ioat_pq16a_descriptor a;
+       struct ioat_pq16b_descriptor b;
+};
+
+#define SED_SIZE       64
+
+struct ioat_sed_raw_descriptor {
+       uint64_t        a[8];
+       uint64_t        b[8];
+       uint64_t        c[8];
+};
+
 #endif
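When wb_en is set, a completed PQ-validate descriptor has its 32-bit size field overwritten by the write-back error status described by the dwbes_f bitfield above; desc_get_errstat() reads it to flag P/Q mismatches. A hedged standalone decoder, assuming the usual little-endian bitfield packing (p_val_err at bit 25, q_val_err at bit 26, wbes at bit 31):

#include <stdint.h>
#include <stdio.h>

#define DWBES_P_VAL_ERR (1u << 25)	/* P parity check failed */
#define DWBES_Q_VAL_ERR (1u << 26)	/* Q parity check failed */
#define DWBES_WBES      (1u << 31)	/* status was actually written back */

static void decode_dwbes(uint32_t dwbes)
{
	if (!(dwbes & DWBES_WBES)) {
		printf("no write-back status recorded\n");
		return;
	}
	printf("P %s, Q %s\n",
	       (dwbes & DWBES_P_VAL_ERR) ? "mismatch" : "ok",
	       (dwbes & DWBES_Q_VAL_ERR) ? "mismatch" : "ok");
}

int main(void)
{
	decode_dwbes(DWBES_WBES | DWBES_Q_VAL_ERR);
	return 0;
}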
index 71c7ecd80fac8d3b9b33dd909ba2533da17e2e83..2c8d560e6334123097627ab59ac166a47cb1f0d6 100644 (file)
@@ -94,6 +94,23 @@ static struct pci_device_id ioat_pci_tbl[] = {
        { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_IVB8) },
        { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_IVB9) },
 
+       { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_HSW0) },
+       { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_HSW1) },
+       { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_HSW2) },
+       { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_HSW3) },
+       { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_HSW4) },
+       { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_HSW5) },
+       { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_HSW6) },
+       { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_HSW7) },
+       { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_HSW8) },
+       { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_HSW9) },
+
+       /* I/OAT v3.3 platforms */
+       { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_BWD0) },
+       { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_BWD1) },
+       { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_BWD2) },
+       { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IOAT_BWD3) },
+
        { 0, }
 };
 MODULE_DEVICE_TABLE(pci, ioat_pci_tbl);
@@ -190,6 +207,9 @@ static void ioat_remove(struct pci_dev *pdev)
        if (!device)
                return;
 
+       if (device->version >= IOAT_VER_3_0)
+               ioat3_dma_remove(device);
+
        dev_err(&pdev->dev, "Removing dma and dca services\n");
        if (device->dca) {
                unregister_dca_provider(device->dca, &pdev->dev);
index 1391798542b66756b9ac0e811e64a683a65b77e8..2f1cfa0f1f475bfa7992c5daf944d093e7970232 100644 (file)
@@ -79,6 +79,8 @@
 #define IOAT_CAP_APIC                          0x00000080
 #define IOAT_CAP_XOR                           0x00000100
 #define IOAT_CAP_PQ                            0x00000200
+#define IOAT_CAP_DWBES                         0x00002000
+#define IOAT_CAP_RAID16SS                      0x00020000
 
 #define IOAT_CHANNEL_MMIO_SIZE                 0x80    /* Each Channel MMIO space is this size */
 
@@ -93,6 +95,8 @@
 #define IOAT_CHANCTRL_ERR_COMPLETION_EN                0x0004
 #define IOAT_CHANCTRL_INT_REARM                        0x0001
 #define IOAT_CHANCTRL_RUN                      (IOAT_CHANCTRL_INT_REARM |\
+                                                IOAT_CHANCTRL_ERR_INT_EN |\
+                                                IOAT_CHANCTRL_ERR_COMPLETION_EN |\
                                                 IOAT_CHANCTRL_ANY_ERR_ABORT_EN)
 
 #define IOAT_DMA_COMP_OFFSET                   0x02    /* 16-bit DMA channel compatibility */
index 8c61d17a86bf182b750d14df3b57f51708a2b326..d39c2cd0795d71437935d22ce90ca636c8fae0aa 100644 (file)
@@ -1642,7 +1642,7 @@ static int __init ipu_idmac_init(struct ipu *ipu)
        return dma_async_device_register(&idmac->dma);
 }
 
-static void __exit ipu_idmac_exit(struct ipu *ipu)
+static void ipu_idmac_exit(struct ipu *ipu)
 {
        int i;
        struct idmac *idmac = &ipu->idmac;
@@ -1756,7 +1756,7 @@ err_noirq:
        return ret;
 }
 
-static int __exit ipu_remove(struct platform_device *pdev)
+static int ipu_remove(struct platform_device *pdev)
 {
        struct ipu *ipu = platform_get_drvdata(pdev);
 
@@ -1781,7 +1781,7 @@ static struct platform_driver ipu_platform_driver = {
                .name   = "ipu-core",
                .owner  = THIS_MODULE,
        },
-       .remove         = __exit_p(ipu_remove),
+       .remove         = ipu_remove,
 };
 
 static int __init ipu_init(void)
index 69d04d28b1efbe78cb871b22af17bb851071e606..7aa0864cd487a4a084dcf3b326f9ef6ba91d3d86 100644 (file)
 #include <linux/device.h>
 #include <linux/err.h>
 #include <linux/module.h>
-#include <linux/rculist.h>
+#include <linux/mutex.h>
 #include <linux/slab.h>
 #include <linux/of.h>
 #include <linux/of_dma.h>
 
 static LIST_HEAD(of_dma_list);
-static DEFINE_SPINLOCK(of_dma_lock);
+static DEFINE_MUTEX(of_dma_lock);
 
 /**
- * of_dma_get_controller - Get a DMA controller in DT DMA helpers list
+ * of_dma_find_controller - Get a DMA controller in DT DMA helpers list
  * @dma_spec:  pointer to DMA specifier as found in the device tree
  *
  * Finds a DMA controller with matching device node and number for dma cells
- * in a list of registered DMA controllers. If a match is found the use_count
- * variable is increased and a valid pointer to the DMA data stored is retuned.
- * A NULL pointer is returned if no match is found.
+ * in a list of registered DMA controllers. If a match is found a valid pointer
+ * to the DMA data stored is returned. A NULL pointer is returned if no match is
+ * found.
  */
-static struct of_dma *of_dma_get_controller(struct of_phandle_args *dma_spec)
+static struct of_dma *of_dma_find_controller(struct of_phandle_args *dma_spec)
 {
        struct of_dma *ofdma;
 
-       spin_lock(&of_dma_lock);
-
-       if (list_empty(&of_dma_list)) {
-               spin_unlock(&of_dma_lock);
-               return NULL;
-       }
-
        list_for_each_entry(ofdma, &of_dma_list, of_dma_controllers)
                if ((ofdma->of_node == dma_spec->np) &&
-                   (ofdma->of_dma_nbcells == dma_spec->args_count)) {
-                       ofdma->use_count++;
-                       spin_unlock(&of_dma_lock);
+                   (ofdma->of_dma_nbcells == dma_spec->args_count))
                        return ofdma;
-               }
-
-       spin_unlock(&of_dma_lock);
 
        pr_debug("%s: can't find DMA controller %s\n", __func__,
                 dma_spec->np->full_name);
@@ -57,22 +45,6 @@ static struct of_dma *of_dma_get_controller(struct of_phandle_args *dma_spec)
        return NULL;
 }
 
-/**
- * of_dma_put_controller - Decrement use count for a registered DMA controller
- * @of_dma:    pointer to DMA controller data
- *
- * Decrements the use_count variable in the DMA data structure. This function
- * should be called only when a valid pointer is returned from
- * of_dma_get_controller() and no further accesses to data referenced by that
- * pointer are needed.
- */
-static void of_dma_put_controller(struct of_dma *ofdma)
-{
-       spin_lock(&of_dma_lock);
-       ofdma->use_count--;
-       spin_unlock(&of_dma_lock);
-}
-
 /**
  * of_dma_controller_register - Register a DMA controller to DT DMA helpers
  * @np:                        device node of DMA controller
@@ -93,6 +65,7 @@ int of_dma_controller_register(struct device_node *np,
 {
        struct of_dma   *ofdma;
        int             nbcells;
+       const __be32    *prop;
 
        if (!np || !of_dma_xlate) {
                pr_err("%s: not enough information provided\n", __func__);
@@ -103,8 +76,11 @@ int of_dma_controller_register(struct device_node *np,
        if (!ofdma)
                return -ENOMEM;
 
-       nbcells = be32_to_cpup(of_get_property(np, "#dma-cells", NULL));
-       if (!nbcells) {
+       prop = of_get_property(np, "#dma-cells", NULL);
+       if (prop)
+               nbcells = be32_to_cpup(prop);
+
+       if (!prop || !nbcells) {
                pr_err("%s: #dma-cells property is missing or invalid\n",
                       __func__);
                kfree(ofdma);
@@ -115,12 +91,11 @@ int of_dma_controller_register(struct device_node *np,
        ofdma->of_dma_nbcells = nbcells;
        ofdma->of_dma_xlate = of_dma_xlate;
        ofdma->of_dma_data = data;
-       ofdma->use_count = 0;
 
        /* Now queue of_dma controller structure in list */
-       spin_lock(&of_dma_lock);
+       mutex_lock(&of_dma_lock);
        list_add_tail(&ofdma->of_dma_controllers, &of_dma_list);
-       spin_unlock(&of_dma_lock);
+       mutex_unlock(&of_dma_lock);
 
        return 0;
 }
@@ -132,32 +107,20 @@ EXPORT_SYMBOL_GPL(of_dma_controller_register);
  *
  * Memory allocated by of_dma_controller_register() is freed here.
  */
-int of_dma_controller_free(struct device_node *np)
+void of_dma_controller_free(struct device_node *np)
 {
        struct of_dma *ofdma;
 
-       spin_lock(&of_dma_lock);
-
-       if (list_empty(&of_dma_list)) {
-               spin_unlock(&of_dma_lock);
-               return -ENODEV;
-       }
+       mutex_lock(&of_dma_lock);
 
        list_for_each_entry(ofdma, &of_dma_list, of_dma_controllers)
                if (ofdma->of_node == np) {
-                       if (ofdma->use_count) {
-                               spin_unlock(&of_dma_lock);
-                               return -EBUSY;
-                       }
-
                        list_del(&ofdma->of_dma_controllers);
-                       spin_unlock(&of_dma_lock);
                        kfree(ofdma);
-                       return 0;
+                       break;
                }
 
-       spin_unlock(&of_dma_lock);
-       return -ENODEV;
+       mutex_unlock(&of_dma_lock);
 }
 EXPORT_SYMBOL_GPL(of_dma_controller_free);
 
@@ -172,8 +135,8 @@ EXPORT_SYMBOL_GPL(of_dma_controller_free);
  * specifiers, matches the name provided. Returns 0 if the name matches and
  * a valid pointer to the DMA specifier is found. Otherwise returns -ENODEV.
  */
-static int of_dma_match_channel(struct device_node *np, char *name, int index,
-                               struct of_phandle_args *dma_spec)
+static int of_dma_match_channel(struct device_node *np, const char *name,
+                               int index, struct of_phandle_args *dma_spec)
 {
        const char *s;
 
@@ -198,7 +161,7 @@ static int of_dma_match_channel(struct device_node *np, char *name, int index,
  * Returns pointer to appropriate dma channel on success or NULL on error.
  */
 struct dma_chan *of_dma_request_slave_channel(struct device_node *np,
-                                             char *name)
+                                             const char *name)
 {
        struct of_phandle_args  dma_spec;
        struct of_dma           *ofdma;
@@ -220,14 +183,15 @@ struct dma_chan *of_dma_request_slave_channel(struct device_node *np,
                if (of_dma_match_channel(np, name, i, &dma_spec))
                        continue;
 
-               ofdma = of_dma_get_controller(&dma_spec);
-
-               if (!ofdma)
-                       continue;
+               mutex_lock(&of_dma_lock);
+               ofdma = of_dma_find_controller(&dma_spec);
 
-               chan = ofdma->of_dma_xlate(&dma_spec, ofdma);
+               if (ofdma)
+                       chan = ofdma->of_dma_xlate(&dma_spec, ofdma);
+               else
+                       chan = NULL;
 
-               of_dma_put_controller(ofdma);
+               mutex_unlock(&of_dma_lock);
 
                of_node_put(dma_spec.np);
 
index 08b43bf3715816951d23cc71dd49ca9b810b55f1..ec3fc4fd9160e8aeddf16054cd405f35b43bfef7 100644 (file)
@@ -16,6 +16,8 @@
 #include <linux/platform_device.h>
 #include <linux/slab.h>
 #include <linux/spinlock.h>
+#include <linux/of_dma.h>
+#include <linux/of_device.h>
 
 #include "virt-dma.h"
 
@@ -67,6 +69,10 @@ static const unsigned es_bytes[] = {
        [OMAP_DMA_DATA_TYPE_S32] = 4,
 };
 
+static struct of_dma_filter_info omap_dma_info = {
+       .filter_fn = omap_dma_filter_fn,
+};
+
 static inline struct omap_dmadev *to_omap_dma_dev(struct dma_device *d)
 {
        return container_of(d, struct omap_dmadev, ddev);
@@ -629,8 +635,22 @@ static int omap_dma_probe(struct platform_device *pdev)
                pr_warn("OMAP-DMA: failed to register slave DMA engine device: %d\n",
                        rc);
                omap_dma_free(od);
-       } else {
-               platform_set_drvdata(pdev, od);
+               return rc;
+       }
+
+       platform_set_drvdata(pdev, od);
+
+       if (pdev->dev.of_node) {
+               omap_dma_info.dma_cap = od->ddev.cap_mask;
+
+               /* Device-tree DMA controller registration */
+               rc = of_dma_controller_register(pdev->dev.of_node,
+                               of_dma_simple_xlate, &omap_dma_info);
+               if (rc) {
+                       pr_warn("OMAP-DMA: failed to register DMA controller\n");
+                       dma_async_device_unregister(&od->ddev);
+                       omap_dma_free(od);
+               }
        }
 
        dev_info(&pdev->dev, "OMAP DMA engine driver\n");
@@ -642,18 +662,32 @@ static int omap_dma_remove(struct platform_device *pdev)
 {
        struct omap_dmadev *od = platform_get_drvdata(pdev);
 
+       if (pdev->dev.of_node)
+               of_dma_controller_free(pdev->dev.of_node);
+
        dma_async_device_unregister(&od->ddev);
        omap_dma_free(od);
 
        return 0;
 }
 
+static const struct of_device_id omap_dma_match[] = {
+       { .compatible = "ti,omap2420-sdma", },
+       { .compatible = "ti,omap2430-sdma", },
+       { .compatible = "ti,omap3430-sdma", },
+       { .compatible = "ti,omap3630-sdma", },
+       { .compatible = "ti,omap4430-sdma", },
+       {},
+};
+MODULE_DEVICE_TABLE(of, omap_dma_match);
+
 static struct platform_driver omap_dma_driver = {
        .probe  = omap_dma_probe,
        .remove = omap_dma_remove,
        .driver = {
                .name = "omap-dma-engine",
                .owner = THIS_MODULE,
+               .of_match_table = of_match_ptr(omap_dma_match),
        },
 };
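
With the of_dma_simple_xlate registration above, clients on device-tree boots no longer need to call omap_dma_filter_fn() themselves. A hedged sketch of the client side; the device node contents and the request name are hypothetical, not from this patch:

    /* client node carries e.g.  dmas = <&sdma 61>;  dma-names = "tx"; */
    struct dma_chan *chan = dma_request_slave_channel(&pdev->dev, "tx");

    if (!chan)
            return -ENODEV;         /* channel not described or not yet available */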
 
index d01faeb0f27c15ee9a55e78d257caf58136ff8d9..ce3dc3e9688c86ea30e3be7757094b78749a6ec9 100644 (file)
@@ -476,7 +476,7 @@ static struct pch_dma_desc *pdc_desc_get(struct pch_dma_chan *pd_chan)
        dev_dbg(chan2dev(&pd_chan->chan), "scanned %d descriptors\n", i);
 
        if (!ret) {
-               ret = pdc_alloc_desc(&pd_chan->chan, GFP_NOIO);
+               ret = pdc_alloc_desc(&pd_chan->chan, GFP_ATOMIC);
                if (ret) {
                        spin_lock(&pd_chan->lock);
                        pd_chan->descs_allocated++;
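
The switch from GFP_NOIO to GFP_ATOMIC suggests this allocation can be reached from a context that is not allowed to sleep; GFP_NOIO avoids I/O recursion but may still block. A generic illustration of the rule, not specific to this driver:

    /* any allocation made while a spinlock is held (or in softirq/irq
     * context) must use a non-sleeping flag */
    spin_lock(&lock);
    desc = kmalloc(sizeof(*desc), GFP_ATOMIC);
    spin_unlock(&lock);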
index 5dbc5946c4c3d9931585ac2440f64274f477fe02..a17553f7c02809325b06830d6eb1e36a4c95e435 100644 (file)
@@ -26,6 +26,7 @@
 #include <linux/scatterlist.h>
 #include <linux/of.h>
 #include <linux/of_dma.h>
+#include <linux/err.h>
 
 #include "dmaengine.h"
 #define PL330_MAX_CHAN         8
@@ -2288,13 +2289,12 @@ static inline void fill_queue(struct dma_pl330_chan *pch)
 
                /* If already submitted */
                if (desc->status == BUSY)
-                       break;
+                       continue;
 
                ret = pl330_submit_req(pch->pl330_chid,
                                                &desc->req);
                if (!ret) {
                        desc->status = BUSY;
-                       break;
                } else if (ret == -EAGAIN) {
                        /* QFull or DMAC Dying */
                        break;
@@ -2904,9 +2904,9 @@ pl330_probe(struct amba_device *adev, const struct amba_id *id)
        pi->mcbufsz = pdat ? pdat->mcbuf_sz : 0;
 
        res = &adev->res;
-       pi->base = devm_request_and_ioremap(&adev->dev, res);
-       if (!pi->base)
-               return -ENXIO;
+       pi->base = devm_ioremap_resource(&adev->dev, res);
+       if (IS_ERR(pi->base))
+               return PTR_ERR(pi->base);
 
        amba_set_drvdata(adev, pdmac);
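
devm_ioremap_resource() is the standard replacement for the deprecated devm_request_and_ioremap(): it validates the resource itself and reports failures as ERR_PTR-encoded errors, so callers test with IS_ERR()/PTR_ERR() rather than for NULL. A generic sketch of the idiom for a platform device, names hypothetical:

    res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
    base = devm_ioremap_resource(&pdev->dev, res);
    if (IS_ERR(base))
            return PTR_ERR(base);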
 
diff --git a/drivers/dma/sh/Kconfig b/drivers/dma/sh/Kconfig
new file mode 100644 (file)
index 0000000..5c1dee2
--- /dev/null
@@ -0,0 +1,24 @@
+#
+# DMA engine configuration for sh
+#
+
+config SH_DMAE_BASE
+       bool "Renesas SuperH DMA Engine support"
+       depends on (SUPERH && SH_DMA) || (ARM && ARCH_SHMOBILE)
+       depends on !SH_DMA_API
+       default y
+       select DMA_ENGINE
+       help
+         Enable support for the Renesas SuperH DMA controllers.
+
+config SH_DMAE
+       tristate "Renesas SuperH DMAC support"
+       depends on SH_DMAE_BASE
+       help
+         Enable support for the Renesas SuperH DMA controllers.
+
+config SUDMAC
+       tristate "Renesas SUDMAC support"
+       depends on SH_DMAE_BASE
+       help
+         Enable support for the Renesas SUDMAC controllers.
index 54ae9572b0ac1f2735d7a8fe3e47adb4c87a6373..c07ca4612e460ef4b20c9869ed78f87edc8f6926 100644 (file)
@@ -1,2 +1,3 @@
-obj-$(CONFIG_SH_DMAE) += shdma-base.o
+obj-$(CONFIG_SH_DMAE_BASE) += shdma-base.o
 obj-$(CONFIG_SH_DMAE) += shdma.o
+obj-$(CONFIG_SUDMAC) += sudmac.o
diff --git a/drivers/dma/sh/sudmac.c b/drivers/dma/sh/sudmac.c
new file mode 100644 (file)
index 0000000..e7c94bb
--- /dev/null
@@ -0,0 +1,428 @@
+/*
+ * Renesas SUDMAC support
+ *
+ * Copyright (C) 2013 Renesas Solutions Corp.
+ *
+ * based on drivers/dma/sh/shdma.c:
+ * Copyright (C) 2011-2012 Guennadi Liakhovetski <g.liakhovetski@gmx.de>
+ * Copyright (C) 2009 Nobuhiro Iwamatsu <iwamatsu.nobuhiro@renesas.com>
+ * Copyright (C) 2009 Renesas Solutions, Inc. All rights reserved.
+ * Copyright (C) 2007 Freescale Semiconductor, Inc. All rights reserved.
+ *
+ * This is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/interrupt.h>
+#include <linux/dmaengine.h>
+#include <linux/platform_device.h>
+#include <linux/sudmac.h>
+
+struct sudmac_chan {
+       struct shdma_chan shdma_chan;
+       void __iomem *base;
+       char dev_id[16];        /* unique name per channel of this DMAC */
+
+       u32 offset;             /* for CFG, BA, BBC, CA, CBC, DEN */
+       u32 cfg;
+       u32 dint_end_bit;
+};
+
+struct sudmac_device {
+       struct shdma_dev shdma_dev;
+       struct sudmac_pdata *pdata;
+       void __iomem *chan_reg;
+};
+
+struct sudmac_regs {
+       u32 base_addr;
+       u32 base_byte_count;
+};
+
+struct sudmac_desc {
+       struct sudmac_regs hw;
+       struct shdma_desc shdma_desc;
+};
+
+#define to_chan(schan) container_of(schan, struct sudmac_chan, shdma_chan)
+#define to_desc(sdesc) container_of(sdesc, struct sudmac_desc, shdma_desc)
+#define to_sdev(sc) container_of(sc->shdma_chan.dma_chan.device, \
+                                struct sudmac_device, shdma_dev.dma_dev)
+
+/* SUDMAC register */
+#define SUDMAC_CH0CFG          0x00
+#define SUDMAC_CH0BA           0x10
+#define SUDMAC_CH0BBC          0x18
+#define SUDMAC_CH0CA           0x20
+#define SUDMAC_CH0CBC          0x28
+#define SUDMAC_CH0DEN          0x30
+#define SUDMAC_DSTSCLR         0x38
+#define SUDMAC_DBUFCTRL                0x3C
+#define SUDMAC_DINTCTRL                0x40
+#define SUDMAC_DINTSTS         0x44
+#define SUDMAC_DINTSTSCLR      0x48
+#define SUDMAC_CH0SHCTRL       0x50
+
+/* Definitions for the sudmac_channel.config */
+#define SUDMAC_SENDBUFM        0x1000 /* b12: Transmit Buffer Mode */
+#define SUDMAC_RCVENDM 0x0100 /* b8: Receive Data Transfer End Mode */
+#define SUDMAC_LBA_WAIT        0x0030 /* b5-4: Local Bus Access Wait */
+
+/* Definitions for the sudmac_channel.dint_end_bit */
+#define SUDMAC_CH1ENDE 0x0002 /* b1: Ch1 DMA Transfer End Int Enable */
+#define SUDMAC_CH0ENDE 0x0001 /* b0: Ch0 DMA Transfer End Int Enable */
+
+#define SUDMAC_DRV_NAME "sudmac"
+
+static void sudmac_writel(struct sudmac_chan *sc, u32 data, u32 reg)
+{
+       iowrite32(data, sc->base + reg);
+}
+
+static u32 sudmac_readl(struct sudmac_chan *sc, u32 reg)
+{
+       return ioread32(sc->base + reg);
+}
+
+static bool sudmac_is_busy(struct sudmac_chan *sc)
+{
+       u32 den = sudmac_readl(sc, SUDMAC_CH0DEN + sc->offset);
+
+       if (den)
+               return true; /* working */
+
+       return false; /* waiting */
+}
+
+static void sudmac_set_reg(struct sudmac_chan *sc, struct sudmac_regs *hw,
+                          struct shdma_desc *sdesc)
+{
+       sudmac_writel(sc, sc->cfg, SUDMAC_CH0CFG + sc->offset);
+       sudmac_writel(sc, hw->base_addr, SUDMAC_CH0BA + sc->offset);
+       sudmac_writel(sc, hw->base_byte_count, SUDMAC_CH0BBC + sc->offset);
+}
+
+static void sudmac_start(struct sudmac_chan *sc)
+{
+       u32 dintctrl = sudmac_readl(sc, SUDMAC_DINTCTRL);
+
+       sudmac_writel(sc, dintctrl | sc->dint_end_bit, SUDMAC_DINTCTRL);
+       sudmac_writel(sc, 1, SUDMAC_CH0DEN + sc->offset);
+}
+
+static void sudmac_start_xfer(struct shdma_chan *schan,
+                             struct shdma_desc *sdesc)
+{
+       struct sudmac_chan *sc = to_chan(schan);
+       struct sudmac_desc *sd = to_desc(sdesc);
+
+       sudmac_set_reg(sc, &sd->hw, sdesc);
+       sudmac_start(sc);
+}
+
+static bool sudmac_channel_busy(struct shdma_chan *schan)
+{
+       struct sudmac_chan *sc = to_chan(schan);
+
+       return sudmac_is_busy(sc);
+}
+
+static void sudmac_setup_xfer(struct shdma_chan *schan, int slave_id)
+{
+}
+
+static const struct sudmac_slave_config *sudmac_find_slave(
+       struct sudmac_chan *sc, int slave_id)
+{
+       struct sudmac_device *sdev = to_sdev(sc);
+       struct sudmac_pdata *pdata = sdev->pdata;
+       const struct sudmac_slave_config *cfg;
+       int i;
+
+       for (i = 0, cfg = pdata->slave; i < pdata->slave_num; i++, cfg++)
+               if (cfg->slave_id == slave_id)
+                       return cfg;
+
+       return NULL;
+}
+
+static int sudmac_set_slave(struct shdma_chan *schan, int slave_id, bool try)
+{
+       struct sudmac_chan *sc = to_chan(schan);
+       const struct sudmac_slave_config *cfg = sudmac_find_slave(sc, slave_id);
+
+       if (!cfg)
+               return -ENODEV;
+
+       return 0;
+}
+
+static inline void sudmac_dma_halt(struct sudmac_chan *sc)
+{
+       u32 dintctrl = sudmac_readl(sc, SUDMAC_DINTCTRL);
+
+       sudmac_writel(sc, 0, SUDMAC_CH0DEN + sc->offset);
+       sudmac_writel(sc, dintctrl & ~sc->dint_end_bit, SUDMAC_DINTCTRL);
+       sudmac_writel(sc, sc->dint_end_bit, SUDMAC_DINTSTSCLR);
+}
+
+static int sudmac_desc_setup(struct shdma_chan *schan,
+                            struct shdma_desc *sdesc,
+                            dma_addr_t src, dma_addr_t dst, size_t *len)
+{
+       struct sudmac_chan *sc = to_chan(schan);
+       struct sudmac_desc *sd = to_desc(sdesc);
+
+       dev_dbg(sc->shdma_chan.dev, "%s: src=%llx, dst=%llx, len=%zu\n",
+               __func__, (unsigned long long)src, (unsigned long long)dst, *len);
+
+       if (*len > schan->max_xfer_len)
+               *len = schan->max_xfer_len;
+
+       if (dst)
+               sd->hw.base_addr = dst;
+       else if (src)
+               sd->hw.base_addr = src;
+       sd->hw.base_byte_count = *len;
+
+       return 0;
+}
+
+static void sudmac_halt(struct shdma_chan *schan)
+{
+       struct sudmac_chan *sc = to_chan(schan);
+
+       sudmac_dma_halt(sc);
+}
+
+static bool sudmac_chan_irq(struct shdma_chan *schan, int irq)
+{
+       struct sudmac_chan *sc = to_chan(schan);
+       u32 dintsts = sudmac_readl(sc, SUDMAC_DINTSTS);
+
+       if (!(dintsts & sc->dint_end_bit))
+               return false;
+
+       /* DMA stop */
+       sudmac_dma_halt(sc);
+
+       return true;
+}
+
+static size_t sudmac_get_partial(struct shdma_chan *schan,
+                                struct shdma_desc *sdesc)
+{
+       struct sudmac_chan *sc = to_chan(schan);
+       struct sudmac_desc *sd = to_desc(sdesc);
+       u32 current_byte_count = sudmac_readl(sc, SUDMAC_CH0CBC + sc->offset);
+
+       return sd->hw.base_byte_count - current_byte_count;
+}
+
+static bool sudmac_desc_completed(struct shdma_chan *schan,
+                                 struct shdma_desc *sdesc)
+{
+       struct sudmac_chan *sc = to_chan(schan);
+       struct sudmac_desc *sd = to_desc(sdesc);
+       u32 current_addr = sudmac_readl(sc, SUDMAC_CH0CA + sc->offset);
+
+       return sd->hw.base_addr + sd->hw.base_byte_count == current_addr;
+}
+
+static int sudmac_chan_probe(struct sudmac_device *su_dev, int id, int irq,
+                            unsigned long flags)
+{
+       struct shdma_dev *sdev = &su_dev->shdma_dev;
+       struct platform_device *pdev = to_platform_device(sdev->dma_dev.dev);
+       struct sudmac_chan *sc;
+       struct shdma_chan *schan;
+       int err;
+
+       sc = devm_kzalloc(&pdev->dev, sizeof(struct sudmac_chan), GFP_KERNEL);
+       if (!sc) {
+               dev_err(sdev->dma_dev.dev,
+                       "No free memory for allocating dma channels!\n");
+               return -ENOMEM;
+       }
+
+       schan = &sc->shdma_chan;
+       schan->max_xfer_len = 64 * 1024 * 1024 - 1;
+
+       shdma_chan_probe(sdev, schan, id);
+
+       sc->base = su_dev->chan_reg;
+
+       /* get platform_data */
+       sc->offset = su_dev->pdata->channel->offset;
+       if (su_dev->pdata->channel->config & SUDMAC_TX_BUFFER_MODE)
+               sc->cfg |= SUDMAC_SENDBUFM;
+       if (su_dev->pdata->channel->config & SUDMAC_RX_END_MODE)
+               sc->cfg |= SUDMAC_RCVENDM;
+       sc->cfg |= (su_dev->pdata->channel->wait << 4) & SUDMAC_LBA_WAIT;
+
+       if (su_dev->pdata->channel->dint_end_bit & SUDMAC_DMA_BIT_CH0)
+               sc->dint_end_bit |= SUDMAC_CH0ENDE;
+       if (su_dev->pdata->channel->dint_end_bit & SUDMAC_DMA_BIT_CH1)
+               sc->dint_end_bit |= SUDMAC_CH1ENDE;
+
+       /* set up channel irq */
+       if (pdev->id >= 0)
+               snprintf(sc->dev_id, sizeof(sc->dev_id), "sudmac%d.%d",
+                        pdev->id, id);
+       else
+               snprintf(sc->dev_id, sizeof(sc->dev_id), "sudmac%d", id);
+
+       err = shdma_request_irq(schan, irq, flags, sc->dev_id);
+       if (err) {
+               dev_err(sdev->dma_dev.dev,
+                       "DMA channel %d request_irq failed %d\n", id, err);
+               goto err_no_irq;
+       }
+
+       return 0;
+
+err_no_irq:
+       /* remove from dmaengine device node */
+       shdma_chan_remove(schan);
+       return err;
+}
+
+static void sudmac_chan_remove(struct sudmac_device *su_dev)
+{
+       struct dma_device *dma_dev = &su_dev->shdma_dev.dma_dev;
+       struct shdma_chan *schan;
+       int i;
+
+       shdma_for_each_chan(schan, &su_dev->shdma_dev, i) {
+               struct sudmac_chan *sc = to_chan(schan);
+
+               BUG_ON(!schan);
+
+               shdma_free_irq(&sc->shdma_chan);
+               shdma_chan_remove(schan);
+       }
+       dma_dev->chancnt = 0;
+}
+
+static dma_addr_t sudmac_slave_addr(struct shdma_chan *schan)
+{
+       /* SUDMAC doesn't need the address */
+       return 0;
+}
+
+static struct shdma_desc *sudmac_embedded_desc(void *buf, int i)
+{
+       return &((struct sudmac_desc *)buf)[i].shdma_desc;
+}
+
+static const struct shdma_ops sudmac_shdma_ops = {
+       .desc_completed = sudmac_desc_completed,
+       .halt_channel = sudmac_halt,
+       .channel_busy = sudmac_channel_busy,
+       .slave_addr = sudmac_slave_addr,
+       .desc_setup = sudmac_desc_setup,
+       .set_slave = sudmac_set_slave,
+       .setup_xfer = sudmac_setup_xfer,
+       .start_xfer = sudmac_start_xfer,
+       .embedded_desc = sudmac_embedded_desc,
+       .chan_irq = sudmac_chan_irq,
+       .get_partial = sudmac_get_partial,
+};
+
+static int sudmac_probe(struct platform_device *pdev)
+{
+       struct sudmac_pdata *pdata = pdev->dev.platform_data;
+       int err, i;
+       struct sudmac_device *su_dev;
+       struct dma_device *dma_dev;
+       struct resource *chan, *irq_res;
+
+       /* get platform data */
+       if (!pdata)
+               return -ENODEV;
+
+       chan = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+       irq_res = platform_get_resource(pdev, IORESOURCE_IRQ, 0);
+       if (!chan || !irq_res)
+               return -ENODEV;
+
+       err = -ENOMEM;
+       su_dev = devm_kzalloc(&pdev->dev, sizeof(struct sudmac_device),
+                             GFP_KERNEL);
+       if (!su_dev) {
+               dev_err(&pdev->dev, "Not enough memory\n");
+               return err;
+       }
+
+       dma_dev = &su_dev->shdma_dev.dma_dev;
+
+       su_dev->chan_reg = devm_request_and_ioremap(&pdev->dev, chan);
+       if (!su_dev->chan_reg)
+               return err;
+
+       dma_cap_set(DMA_SLAVE, dma_dev->cap_mask);
+
+       su_dev->shdma_dev.ops = &sudmac_shdma_ops;
+       su_dev->shdma_dev.desc_size = sizeof(struct sudmac_desc);
+       err = shdma_init(&pdev->dev, &su_dev->shdma_dev, pdata->channel_num);
+       if (err < 0)
+               return err;
+
+       /* platform data */
+       su_dev->pdata = pdev->dev.platform_data;
+
+       platform_set_drvdata(pdev, su_dev);
+
+       /* Create DMA Channel */
+       for (i = 0; i < pdata->channel_num; i++) {
+               err = sudmac_chan_probe(su_dev, i, irq_res->start, IRQF_SHARED);
+               if (err)
+                       goto chan_probe_err;
+       }
+
+       err = dma_async_device_register(&su_dev->shdma_dev.dma_dev);
+       if (err < 0)
+               goto chan_probe_err;
+
+       return err;
+
+chan_probe_err:
+       sudmac_chan_remove(su_dev);
+
+       platform_set_drvdata(pdev, NULL);
+       shdma_cleanup(&su_dev->shdma_dev);
+
+       return err;
+}
+
+static int sudmac_remove(struct platform_device *pdev)
+{
+       struct sudmac_device *su_dev = platform_get_drvdata(pdev);
+       struct dma_device *dma_dev = &su_dev->shdma_dev.dma_dev;
+
+       dma_async_device_unregister(dma_dev);
+       sudmac_chan_remove(su_dev);
+       shdma_cleanup(&su_dev->shdma_dev);
+       platform_set_drvdata(pdev, NULL);
+
+       return 0;
+}
+
+static struct platform_driver sudmac_driver = {
+       .driver         = {
+               .owner  = THIS_MODULE,
+               .name   = SUDMAC_DRV_NAME,
+       },
+       .probe          = sudmac_probe,
+       .remove         = sudmac_remove,
+};
+module_platform_driver(sudmac_driver);
+
+MODULE_AUTHOR("Yoshihiro Shimoda");
+MODULE_DESCRIPTION("Renesas SUDMAC driver");
+MODULE_LICENSE("GPL v2");
+MODULE_ALIAS("platform:" SUDMAC_DRV_NAME);
index 1d627e2391f495ef2b8b9fd2e6b5359da940df2d..1765a0a2736d0d729041dc5dc7dbb110f1d1c5e4 100644 (file)
@@ -16,6 +16,7 @@
 #include <linux/of_address.h>
 #include <linux/of_device.h>
 #include <linux/of_platform.h>
+#include <linux/clk.h>
 #include <linux/sirfsoc_dma.h>
 
 #include "dmaengine.h"
@@ -78,6 +79,7 @@ struct sirfsoc_dma {
        struct sirfsoc_dma_chan         channels[SIRFSOC_DMA_CHANNELS];
        void __iomem                    *base;
        int                             irq;
+       struct clk                      *clk;
        bool                            is_marco;
 };
 
@@ -639,6 +641,12 @@ static int sirfsoc_dma_probe(struct platform_device *op)
                return -EINVAL;
        }
 
+       sdma->clk = devm_clk_get(dev, NULL);
+       if (IS_ERR(sdma->clk)) {
+               dev_err(dev, "failed to get a clock.\n");
+               return PTR_ERR(sdma->clk);
+       }
+
        ret = of_address_to_resource(dn, 0, &res);
        if (ret) {
                dev_err(dev, "Error parsing memory region!\n");
@@ -698,6 +706,8 @@ static int sirfsoc_dma_probe(struct platform_device *op)
 
        tasklet_init(&sdma->tasklet, sirfsoc_dma_tasklet, (unsigned long)sdma);
 
+       clk_prepare_enable(sdma->clk);
+
        /* Register DMA engine */
        dev_set_drvdata(dev, sdma);
        ret = dma_async_device_register(dma);
@@ -720,6 +730,7 @@ static int sirfsoc_dma_remove(struct platform_device *op)
        struct device *dev = &op->dev;
        struct sirfsoc_dma *sdma = dev_get_drvdata(dev);
 
+       clk_disable_unprepare(sdma->clk);
        dma_async_device_unregister(&sdma->dma);
        free_irq(sdma->irq, sdma);
        irq_dispose_mapping(sdma->irq);
@@ -742,7 +753,18 @@ static struct platform_driver sirfsoc_dma_driver = {
        },
 };
 
-module_platform_driver(sirfsoc_dma_driver);
+static int __init sirfsoc_dma_init(void)
+{
+       return platform_driver_register(&sirfsoc_dma_driver);
+}
+
+static void __exit sirfsoc_dma_exit(void)
+{
+       platform_driver_unregister(&sirfsoc_dma_driver);
+}
+
+subsys_initcall(sirfsoc_dma_init);
+module_exit(sirfsoc_dma_exit);
 
 MODULE_AUTHOR("Rongjun Ying <rongjun.ying@csr.com>, "
        "Barry Song <baohua.song@csr.com>");
index fcee27eae1f6d35d9635a009e3fe991fc87a0eeb..ce193409ebd32345e997dad798625c41565b438e 100644 (file)
@@ -30,6 +30,7 @@
 #include <linux/of.h>
 #include <linux/of_device.h>
 #include <linux/platform_device.h>
+#include <linux/pm.h>
 #include <linux/pm_runtime.h>
 #include <linux/slab.h>
 #include <linux/clk/tegra.h>
@@ -199,6 +200,7 @@ struct tegra_dma_channel {
 
        /* Channel-slave specific configuration */
        struct dma_slave_config dma_sconfig;
+       struct tegra_dma_channel_regs   channel_reg;
 };
 
 /* tegra_dma: Tegra DMA specific information */
@@ -1213,7 +1215,6 @@ static const struct tegra_dma_chip_data tegra20_dma_chip_data = {
        .support_channel_pause  = false,
 };
 
-#if defined(CONFIG_OF)
 /* Tegra30 specific DMA controller information */
 static const struct tegra_dma_chip_data tegra30_dma_chip_data = {
        .nr_channels            = 32,
@@ -1243,7 +1244,6 @@ static const struct of_device_id tegra_dma_of_match[] = {
        },
 };
 MODULE_DEVICE_TABLE(of, tegra_dma_of_match);
-#endif
 
 static int tegra_dma_probe(struct platform_device *pdev)
 {
@@ -1252,20 +1252,14 @@ static int tegra_dma_probe(struct platform_device *pdev)
        int ret;
        int i;
        const struct tegra_dma_chip_data *cdata = NULL;
+       const struct of_device_id *match;
 
-       if (pdev->dev.of_node) {
-               const struct of_device_id *match;
-               match = of_match_device(of_match_ptr(tegra_dma_of_match),
-                                       &pdev->dev);
-               if (!match) {
-                       dev_err(&pdev->dev, "Error: No device match found\n");
-                       return -ENODEV;
-               }
-               cdata = match->data;
-       } else {
-               /* If no device tree then fallback to tegra20 */
-               cdata = &tegra20_dma_chip_data;
+       match = of_match_device(tegra_dma_of_match, &pdev->dev);
+       if (!match) {
+               dev_err(&pdev->dev, "Error: No device match found\n");
+               return -ENODEV;
        }
+       cdata = match->data;
 
        tdma = devm_kzalloc(&pdev->dev, sizeof(*tdma) + cdata->nr_channels *
                        sizeof(struct tegra_dma_channel), GFP_KERNEL);
@@ -1448,11 +1442,74 @@ static int tegra_dma_runtime_resume(struct device *dev)
        return 0;
 }
 
+#ifdef CONFIG_PM_SLEEP
+static int tegra_dma_pm_suspend(struct device *dev)
+{
+       struct tegra_dma *tdma = dev_get_drvdata(dev);
+       int i;
+       int ret;
+
+       /* Enable clock before accessing register */
+       ret = tegra_dma_runtime_resume(dev);
+       if (ret < 0)
+               return ret;
+
+       tdma->reg_gen = tdma_read(tdma, TEGRA_APBDMA_GENERAL);
+       for (i = 0; i < tdma->chip_data->nr_channels; i++) {
+               struct tegra_dma_channel *tdc = &tdma->channels[i];
+               struct tegra_dma_channel_regs *ch_reg = &tdc->channel_reg;
+
+               ch_reg->csr = tdc_read(tdc, TEGRA_APBDMA_CHAN_CSR);
+               ch_reg->ahb_ptr = tdc_read(tdc, TEGRA_APBDMA_CHAN_AHBPTR);
+               ch_reg->apb_ptr = tdc_read(tdc, TEGRA_APBDMA_CHAN_APBPTR);
+               ch_reg->ahb_seq = tdc_read(tdc, TEGRA_APBDMA_CHAN_AHBSEQ);
+               ch_reg->apb_seq = tdc_read(tdc, TEGRA_APBDMA_CHAN_APBSEQ);
+       }
+
+       /* Disable clock */
+       tegra_dma_runtime_suspend(dev);
+       return 0;
+}
+
+static int tegra_dma_pm_resume(struct device *dev)
+{
+       struct tegra_dma *tdma = dev_get_drvdata(dev);
+       int i;
+       int ret;
+
+       /* Enable clock before accessing register */
+       ret = tegra_dma_runtime_resume(dev);
+       if (ret < 0)
+               return ret;
+
+       tdma_write(tdma, TEGRA_APBDMA_GENERAL, tdma->reg_gen);
+       tdma_write(tdma, TEGRA_APBDMA_CONTROL, 0);
+       tdma_write(tdma, TEGRA_APBDMA_IRQ_MASK_SET, 0xFFFFFFFFul);
+
+       for (i = 0; i < tdma->chip_data->nr_channels; i++) {
+               struct tegra_dma_channel *tdc = &tdma->channels[i];
+               struct tegra_dma_channel_regs *ch_reg = &tdc->channel_reg;
+
+               tdc_write(tdc, TEGRA_APBDMA_CHAN_APBSEQ, ch_reg->apb_seq);
+               tdc_write(tdc, TEGRA_APBDMA_CHAN_APBPTR, ch_reg->apb_ptr);
+               tdc_write(tdc, TEGRA_APBDMA_CHAN_AHBSEQ, ch_reg->ahb_seq);
+               tdc_write(tdc, TEGRA_APBDMA_CHAN_AHBPTR, ch_reg->ahb_ptr);
+               tdc_write(tdc, TEGRA_APBDMA_CHAN_CSR,
+                       (ch_reg->csr & ~TEGRA_APBDMA_CSR_ENB));
+       }
+
+       /* Disable clock */
+       tegra_dma_runtime_suspend(dev);
+       return 0;
+}
+#endif
+
 static const struct dev_pm_ops tegra_dma_dev_pm_ops = {
 #ifdef CONFIG_PM_RUNTIME
        .runtime_suspend = tegra_dma_runtime_suspend,
        .runtime_resume = tegra_dma_runtime_resume,
 #endif
+       SET_SYSTEM_SLEEP_PM_OPS(tegra_dma_pm_suspend, tegra_dma_pm_resume)
 };
 
 static struct platform_driver tegra_dmac_driver = {
@@ -1460,7 +1517,7 @@ static struct platform_driver tegra_dmac_driver = {
                .name   = "tegra-apbdma",
                .owner = THIS_MODULE,
                .pm     = &tegra_dma_dev_pm_ops,
-               .of_match_table = of_match_ptr(tegra_dma_of_match),
+               .of_match_table = tegra_dma_of_match,
        },
        .probe          = tegra_dma_probe,
        .remove         = tegra_dma_remove,
index 952f823901a6cb5d168ce0b90f2429383ed5bfc3..26107ba6edb33a2fb7d9165aa4ef874bf02f8fbe 100644 (file)
@@ -823,7 +823,7 @@ static struct platform_driver td_driver = {
                .owner  = THIS_MODULE,
        },
        .probe  = td_probe,
-       .remove = __exit_p(td_remove),
+       .remove = td_remove,
 };
 
 module_platform_driver(td_driver);
index 913f55c76c9915bfb08bde041afdc2d9a7d7590f..a59fb4841d4c18283eae911c076c43dc042f0748 100644 (file)
@@ -1190,7 +1190,7 @@ static int __init txx9dmac_chan_probe(struct platform_device *pdev)
        return 0;
 }
 
-static int __exit txx9dmac_chan_remove(struct platform_device *pdev)
+static int txx9dmac_chan_remove(struct platform_device *pdev)
 {
        struct txx9dmac_chan *dc = platform_get_drvdata(pdev);
 
@@ -1252,7 +1252,7 @@ static int __init txx9dmac_probe(struct platform_device *pdev)
        return 0;
 }
 
-static int __exit txx9dmac_remove(struct platform_device *pdev)
+static int txx9dmac_remove(struct platform_device *pdev)
 {
        struct txx9dmac_dev *ddev = platform_get_drvdata(pdev);
 
@@ -1299,14 +1299,14 @@ static const struct dev_pm_ops txx9dmac_dev_pm_ops = {
 };
 
 static struct platform_driver txx9dmac_chan_driver = {
-       .remove         = __exit_p(txx9dmac_chan_remove),
+       .remove         = txx9dmac_chan_remove,
        .driver = {
                .name   = "txx9dmac-chan",
        },
 };
 
 static struct platform_driver txx9dmac_driver = {
-       .remove         = __exit_p(txx9dmac_remove),
+       .remove         = txx9dmac_remove,
        .shutdown       = txx9dmac_shutdown,
        .driver = {
                .name   = "txx9dmac",
index 5899a76eec3bd9086d1edfc24fa1b4aca1ce7969..67610a6ebf875765682cb50bec4226b42f956fc3 100644 (file)
@@ -87,7 +87,7 @@ static struct device *mci_pdev;
 /*
  * various constants for Memory Controllers
  */
-static const char *mem_types[] = {
+static const char * const mem_types[] = {
        [MEM_EMPTY] = "Empty",
        [MEM_RESERVED] = "Reserved",
        [MEM_UNKNOWN] = "Unknown",
@@ -107,7 +107,7 @@ static const char *mem_types[] = {
        [MEM_RDDR3] = "Registered-DDR3"
 };
 
-static const char *dev_types[] = {
+static const char * const dev_types[] = {
        [DEV_UNKNOWN] = "Unknown",
        [DEV_X1] = "x1",
        [DEV_X2] = "x2",
@@ -118,7 +118,7 @@ static const char *dev_types[] = {
        [DEV_X64] = "x64"
 };
 
-static const char *edac_caps[] = {
+static const char * const edac_caps[] = {
        [EDAC_UNKNOWN] = "Unknown",
        [EDAC_NONE] = "None",
        [EDAC_RESERVED] = "Reserved",
@@ -327,17 +327,17 @@ static struct device_attribute *dynamic_csrow_dimm_attr[] = {
 };
 
 /* possible dynamic channel ce_count attribute files */
-DEVICE_CHANNEL(ch0_ce_count, S_IRUGO | S_IWUSR,
+DEVICE_CHANNEL(ch0_ce_count, S_IRUGO,
                   channel_ce_count_show, NULL, 0);
-DEVICE_CHANNEL(ch1_ce_count, S_IRUGO | S_IWUSR,
+DEVICE_CHANNEL(ch1_ce_count, S_IRUGO,
                   channel_ce_count_show, NULL, 1);
-DEVICE_CHANNEL(ch2_ce_count, S_IRUGO | S_IWUSR,
+DEVICE_CHANNEL(ch2_ce_count, S_IRUGO,
                   channel_ce_count_show, NULL, 2);
-DEVICE_CHANNEL(ch3_ce_count, S_IRUGO | S_IWUSR,
+DEVICE_CHANNEL(ch3_ce_count, S_IRUGO,
                   channel_ce_count_show, NULL, 3);
-DEVICE_CHANNEL(ch4_ce_count, S_IRUGO | S_IWUSR,
+DEVICE_CHANNEL(ch4_ce_count, S_IRUGO,
                   channel_ce_count_show, NULL, 4);
-DEVICE_CHANNEL(ch5_ce_count, S_IRUGO | S_IWUSR,
+DEVICE_CHANNEL(ch5_ce_count, S_IRUGO,
                   channel_ce_count_show, NULL, 5);
 
 /* Total possible dynamic ce_count attribute file table */
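
Two separate cleanups in this file: the string tables become "const char * const" so the pointer arrays themselves are read-only, and the per-channel ce_count attributes drop S_IWUSR because they pass NULL as their store method. A tiny generic illustration of the array change, not taken from this file:

    static const char * const kinds[] = { "empty", "reserved" };
    /* kinds[0] = "x";  would now be a compile error: the pointers are const */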
index 5168a1324a65743da44b1e4deff557419cf1ab9e..3297301a42d4089b74ad4498b456aa495702a810 100644 (file)
@@ -16,7 +16,7 @@ comment "Extcon Device Drivers"
 
 config EXTCON_GPIO
        tristate "GPIO extcon support"
-       depends on GENERIC_GPIO
+       depends on GPIOLIB
        help
          Say Y here to enable GPIO based extcon support. Note that GPIO
          extcon supports single state per extcon instance.
index 27ac423ab25e26e3a78defb81fcdad21b1050f9d..7ef316fdc4d964cc8d1f87bd7be0cb9995223612 100644 (file)
@@ -389,10 +389,8 @@ static void queue_bus_reset_event(struct client *client)
        struct bus_reset_event *e;
 
        e = kzalloc(sizeof(*e), GFP_KERNEL);
-       if (e == NULL) {
-               fw_notice(client->device->card, "out of memory when allocating event\n");
+       if (e == NULL)
                return;
-       }
 
        fill_bus_reset_event(&e->reset, client);
 
@@ -693,10 +691,9 @@ static void handle_request(struct fw_card *card, struct fw_request *request,
 
        r = kmalloc(sizeof(*r), GFP_ATOMIC);
        e = kmalloc(sizeof(*e), GFP_ATOMIC);
-       if (r == NULL || e == NULL) {
-               fw_notice(card, "out of memory when allocating event\n");
+       if (r == NULL || e == NULL)
                goto failed;
-       }
+
        r->card    = card;
        r->request = request;
        r->data    = payload;
@@ -930,10 +927,9 @@ static void iso_callback(struct fw_iso_context *context, u32 cycle,
        struct iso_interrupt_event *e;
 
        e = kmalloc(sizeof(*e) + header_length, GFP_ATOMIC);
-       if (e == NULL) {
-               fw_notice(context->card, "out of memory when allocating event\n");
+       if (e == NULL)
                return;
-       }
+
        e->interrupt.type      = FW_CDEV_EVENT_ISO_INTERRUPT;
        e->interrupt.closure   = client->iso_closure;
        e->interrupt.cycle     = cycle;
@@ -950,10 +946,9 @@ static void iso_mc_callback(struct fw_iso_context *context,
        struct iso_interrupt_mc_event *e;
 
        e = kmalloc(sizeof(*e), GFP_ATOMIC);
-       if (e == NULL) {
-               fw_notice(context->card, "out of memory when allocating event\n");
+       if (e == NULL)
                return;
-       }
+
        e->interrupt.type      = FW_CDEV_EVENT_ISO_INTERRUPT_MULTICHANNEL;
        e->interrupt.closure   = client->iso_closure;
        e->interrupt.completed = fw_iso_buffer_lookup(&client->buffer,
@@ -1366,8 +1361,7 @@ static int init_iso_resource(struct client *client,
        int ret;
 
        if ((request->channels == 0 && request->bandwidth == 0) ||
-           request->bandwidth > BANDWIDTH_AVAILABLE_INITIAL ||
-           request->bandwidth < 0)
+           request->bandwidth > BANDWIDTH_AVAILABLE_INITIAL)
                return -EINVAL;
 
        r  = kmalloc(sizeof(*r), GFP_KERNEL);
@@ -1582,10 +1576,9 @@ void fw_cdev_handle_phy_packet(struct fw_card *card, struct fw_packet *p)
 
        list_for_each_entry(client, &card->phy_receiver_list, phy_receiver_link) {
                e = kmalloc(sizeof(*e) + 8, GFP_ATOMIC);
-               if (e == NULL) {
-                       fw_notice(card, "out of memory when allocating event\n");
+               if (e == NULL)
                        break;
-               }
+
                e->phy_packet.closure   = client->phy_receiver_closure;
                e->phy_packet.type      = FW_CDEV_EVENT_PHY_PACKET_RECEIVED;
                e->phy_packet.rcode     = RCODE_COMPLETE;
index 03ce7d980c6ac4d83015e83731d032d778d482aa..664a6ff0a82363b9846ed00679e8b434e765d4d2 100644 (file)
@@ -692,10 +692,8 @@ static void create_units(struct fw_device *device)
                 * match the drivers id_tables against it.
                 */
                unit = kzalloc(sizeof(*unit), GFP_KERNEL);
-               if (unit == NULL) {
-                       fw_err(device->card, "out of memory for unit\n");
+               if (unit == NULL)
                        continue;
-               }
 
                unit->directory = ci.p + value - 1;
                unit->device.bus = &fw_bus_type;
index 4d565365e476c1f7bdef79070fe66b7cb17a0457..815b0fcbe918e92225248fede7329e6c13cc8dde 100644 (file)
@@ -356,10 +356,8 @@ static struct fwnet_fragment_info *fwnet_frag_new(
        }
 
        new = kmalloc(sizeof(*new), GFP_ATOMIC);
-       if (!new) {
-               dev_err(&pd->skb->dev->dev, "out of memory\n");
+       if (!new)
                return NULL;
-       }
 
        new->offset = offset;
        new->len = len;
@@ -402,8 +400,6 @@ fail_w_fi:
 fail_w_new:
        kfree(new);
 fail:
-       dev_err(&net->dev, "out of memory\n");
-
        return NULL;
 }
 
@@ -609,7 +605,6 @@ static int fwnet_incoming_packet(struct fwnet_device *dev, __be32 *buf, int len,
 
                skb = dev_alloc_skb(len + LL_RESERVED_SPACE(net));
                if (unlikely(!skb)) {
-                       dev_err(&net->dev, "out of memory\n");
                        net->stats.rx_dropped++;
 
                        return -ENOMEM;
index 45912e6e0ac2e59b9b6a1b9001d984b9d2760ec0..9e1db6490b9a3bb497b7911d94c42d9fc6190be7 100644 (file)
 #include "core.h"
 #include "ohci.h"
 
+#define ohci_info(ohci, f, args...)    dev_info(ohci->card.device, f, ##args)
+#define ohci_notice(ohci, f, args...)  dev_notice(ohci->card.device, f, ##args)
+#define ohci_err(ohci, f, args...)     dev_err(ohci->card.device, f, ##args)
+
 #define DESCRIPTOR_OUTPUT_MORE         0
 #define DESCRIPTOR_OUTPUT_LAST         (1 << 12)
 #define DESCRIPTOR_INPUT_MORE          (2 << 12)
@@ -68,6 +72,8 @@
 #define DESCRIPTOR_BRANCH_ALWAYS       (3 << 2)
 #define DESCRIPTOR_WAIT                        (3 << 0)
 
+#define DESCRIPTOR_CMD                 (0xf << 12)
+
 struct descriptor {
        __le16 req_count;
        __le16 control;
@@ -149,10 +155,11 @@ struct context {
        struct descriptor *last;
 
        /*
-        * The last descriptor in the DMA program.  It contains the branch
+        * The last descriptor block in the DMA program. It contains the branch
         * address that must be updated upon appending a new descriptor.
         */
        struct descriptor *prev;
+       int prev_z;
 
        descriptor_callback_t callback;
 
@@ -270,7 +277,9 @@ static char ohci_driver_name[] = KBUILD_MODNAME;
 #define PCI_DEVICE_ID_TI_TSB12LV22     0x8009
 #define PCI_DEVICE_ID_TI_TSB12LV26     0x8020
 #define PCI_DEVICE_ID_TI_TSB82AA2      0x8025
+#define PCI_DEVICE_ID_VIA_VT630X       0x3044
 #define PCI_VENDOR_ID_PINNACLE_SYSTEMS 0x11bd
+#define PCI_REV_ID_VIA_VT6306          0x46
 
 #define QUIRK_CYCLE_TIMER              1
 #define QUIRK_RESET_PACKET             2
@@ -278,6 +287,8 @@ static char ohci_driver_name[] = KBUILD_MODNAME;
 #define QUIRK_NO_1394A                 8
 #define QUIRK_NO_MSI                   16
 #define QUIRK_TI_SLLZ059               32
+#define QUIRK_IR_WAKE                  64
+#define QUIRK_PHY_LCTRL_TIMEOUT                128
 
 /* In case of multiple matches in ohci_quirks[], only the first one is used. */
 static const struct {
@@ -290,7 +301,10 @@ static const struct {
                QUIRK_BE_HEADERS},
 
        {PCI_VENDOR_ID_ATT, PCI_DEVICE_ID_AGERE_FW643, 6,
-               QUIRK_NO_MSI},
+               QUIRK_PHY_LCTRL_TIMEOUT | QUIRK_NO_MSI},
+
+       {PCI_VENDOR_ID_ATT, PCI_ANY_ID, PCI_ANY_ID,
+               QUIRK_PHY_LCTRL_TIMEOUT},
 
        {PCI_VENDOR_ID_CREATIVE, PCI_DEVICE_ID_CREATIVE_SB1394, PCI_ANY_ID,
                QUIRK_RESET_PACKET},
@@ -319,6 +333,9 @@ static const struct {
        {PCI_VENDOR_ID_TI, PCI_ANY_ID, PCI_ANY_ID,
                QUIRK_RESET_PACKET},
 
+       {PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_VT630X, PCI_REV_ID_VIA_VT6306,
+               QUIRK_CYCLE_TIMER | QUIRK_IR_WAKE},
+
        {PCI_VENDOR_ID_VIA, PCI_ANY_ID, PCI_ANY_ID,
                QUIRK_CYCLE_TIMER | QUIRK_NO_MSI},
 };
@@ -333,6 +350,8 @@ MODULE_PARM_DESC(quirks, "Chip quirks (default = 0"
        ", no 1394a enhancements = "    __stringify(QUIRK_NO_1394A)
        ", disable MSI = "              __stringify(QUIRK_NO_MSI)
        ", TI SLLZ059 erratum = "       __stringify(QUIRK_TI_SLLZ059)
+       ", IR wake unreliable = "       __stringify(QUIRK_IR_WAKE)
+       ", phy LCtrl timeout = "        __stringify(QUIRK_PHY_LCTRL_TIMEOUT)
        ")");
 
 #define OHCI_PARAM_DEBUG_AT_AR         1
@@ -359,8 +378,7 @@ static void log_irqs(struct fw_ohci *ohci, u32 evt)
            !(evt & OHCI1394_busReset))
                return;
 
-       dev_notice(ohci->card.device,
-           "IRQ %08x%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n", evt,
+       ohci_notice(ohci, "IRQ %08x%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n", evt,
            evt & OHCI1394_selfIDComplete       ? " selfID"             : "",
            evt & OHCI1394_RQPkt                ? " AR_req"             : "",
            evt & OHCI1394_RSPkt                ? " AR_resp"            : "",
@@ -406,21 +424,19 @@ static void log_selfids(struct fw_ohci *ohci, int generation, int self_id_count)
        if (likely(!(param_debug & OHCI_PARAM_DEBUG_SELFIDS)))
                return;
 
-       dev_notice(ohci->card.device,
-                  "%d selfIDs, generation %d, local node ID %04x\n",
-                  self_id_count, generation, ohci->node_id);
+       ohci_notice(ohci, "%d selfIDs, generation %d, local node ID %04x\n",
+                   self_id_count, generation, ohci->node_id);
 
        for (s = ohci->self_id_buffer; self_id_count--; ++s)
                if ((*s & 1 << 23) == 0)
-                       dev_notice(ohci->card.device,
-                           "selfID 0: %08x, phy %d [%c%c%c] "
-                           "%s gc=%d %s %s%s%s\n",
+                       ohci_notice(ohci,
+                           "selfID 0: %08x, phy %d [%c%c%c] %s gc=%d %s %s%s%s\n",
                            *s, *s >> 24 & 63, _p(s, 6), _p(s, 4), _p(s, 2),
                            speed[*s >> 14 & 3], *s >> 16 & 63,
                            power[*s >> 8 & 7], *s >> 22 & 1 ? "L" : "",
                            *s >> 11 & 1 ? "c" : "", *s & 2 ? "i" : "");
                else
-                       dev_notice(ohci->card.device,
+                       ohci_notice(ohci,
                            "selfID n: %08x, phy %d [%c%c%c%c%c%c%c%c]\n",
                            *s, *s >> 24 & 63,
                            _p(s, 16), _p(s, 14), _p(s, 12), _p(s, 10),
@@ -470,9 +486,8 @@ static void log_ar_at_event(struct fw_ohci *ohci,
                        evt = 0x1f;
 
        if (evt == OHCI1394_evt_bus_reset) {
-               dev_notice(ohci->card.device,
-                          "A%c evt_bus_reset, generation %d\n",
-                          dir, (header[2] >> 16) & 0xff);
+               ohci_notice(ohci, "A%c evt_bus_reset, generation %d\n",
+                           dir, (header[2] >> 16) & 0xff);
                return;
        }
 
@@ -491,32 +506,26 @@ static void log_ar_at_event(struct fw_ohci *ohci,
 
        switch (tcode) {
        case 0xa:
-               dev_notice(ohci->card.device,
-                          "A%c %s, %s\n",
-                          dir, evts[evt], tcodes[tcode]);
+               ohci_notice(ohci, "A%c %s, %s\n",
+                           dir, evts[evt], tcodes[tcode]);
                break;
        case 0xe:
-               dev_notice(ohci->card.device,
-                          "A%c %s, PHY %08x %08x\n",
-                          dir, evts[evt], header[1], header[2]);
+               ohci_notice(ohci, "A%c %s, PHY %08x %08x\n",
+                           dir, evts[evt], header[1], header[2]);
                break;
        case 0x0: case 0x1: case 0x4: case 0x5: case 0x9:
-               dev_notice(ohci->card.device,
-                          "A%c spd %x tl %02x, "
-                          "%04x -> %04x, %s, "
-                          "%s, %04x%08x%s\n",
-                          dir, speed, header[0] >> 10 & 0x3f,
-                          header[1] >> 16, header[0] >> 16, evts[evt],
-                          tcodes[tcode], header[1] & 0xffff, header[2], specific);
+               ohci_notice(ohci,
+                           "A%c spd %x tl %02x, %04x -> %04x, %s, %s, %04x%08x%s\n",
+                           dir, speed, header[0] >> 10 & 0x3f,
+                           header[1] >> 16, header[0] >> 16, evts[evt],
+                           tcodes[tcode], header[1] & 0xffff, header[2], specific);
                break;
        default:
-               dev_notice(ohci->card.device,
-                          "A%c spd %x tl %02x, "
-                          "%04x -> %04x, %s, "
-                          "%s%s\n",
-                          dir, speed, header[0] >> 10 & 0x3f,
-                          header[1] >> 16, header[0] >> 16, evts[evt],
-                          tcodes[tcode], specific);
+               ohci_notice(ohci,
+                           "A%c spd %x tl %02x, %04x -> %04x, %s, %s%s\n",
+                           dir, speed, header[0] >> 10 & 0x3f,
+                           header[1] >> 16, header[0] >> 16, evts[evt],
+                           tcodes[tcode], specific);
        }
 }
 
@@ -563,7 +572,8 @@ static int read_phy_reg(struct fw_ohci *ohci, int addr)
                if (i >= 3)
                        msleep(1);
        }
-       dev_err(ohci->card.device, "failed to read phy reg\n");
+       ohci_err(ohci, "failed to read phy reg %d\n", addr);
+       dump_stack();
 
        return -EBUSY;
 }
@@ -585,7 +595,8 @@ static int write_phy_reg(const struct fw_ohci *ohci, int addr, u32 val)
                if (i >= 3)
                        msleep(1);
        }
-       dev_err(ohci->card.device, "failed to write phy reg\n");
+       ohci_err(ohci, "failed to write phy reg %d, val %u\n", addr, val);
+       dump_stack();
 
        return -EBUSY;
 }
@@ -690,8 +701,7 @@ static void ar_context_abort(struct ar_context *ctx, const char *error_msg)
                reg_write(ohci, CONTROL_CLEAR(ctx->regs), CONTEXT_RUN);
                flush_writes(ohci);
 
-               dev_err(ohci->card.device, "AR error: %s; DMA stopped\n",
-                       error_msg);
+               ohci_err(ohci, "AR error: %s; DMA stopped\n", error_msg);
        }
        /* FIXME: restart? */
 }
@@ -1157,6 +1167,7 @@ static int context_init(struct context *ctx, struct fw_ohci *ohci,
        ctx->buffer_tail->used += sizeof(*ctx->buffer_tail->buffer);
        ctx->last = ctx->buffer_tail->buffer;
        ctx->prev = ctx->buffer_tail->buffer;
+       ctx->prev_z = 1;
 
        return 0;
 }
@@ -1221,14 +1232,35 @@ static void context_append(struct context *ctx,
 {
        dma_addr_t d_bus;
        struct descriptor_buffer *desc = ctx->buffer_tail;
+       struct descriptor *d_branch;
 
        d_bus = desc->buffer_bus + (d - desc->buffer) * sizeof(*d);
 
        desc->used += (z + extra) * sizeof(*d);
 
        wmb(); /* finish init of new descriptors before branch_address update */
-       ctx->prev->branch_address = cpu_to_le32(d_bus | z);
-       ctx->prev = find_branch_descriptor(d, z);
+
+       d_branch = find_branch_descriptor(ctx->prev, ctx->prev_z);
+       d_branch->branch_address = cpu_to_le32(d_bus | z);
+
+       /*
+        * VT6306 incorrectly checks only the single descriptor at the
+        * CommandPtr when the wake bit is written, so if it's a
+        * multi-descriptor block starting with an INPUT_MORE, put a copy of
+        * the branch address in the first descriptor.
+        *
+        * This is not done for transmit contexts, since it is unclear how it
+        * would interact with skip addresses.
+        */
+       if (unlikely(ctx->ohci->quirks & QUIRK_IR_WAKE) &&
+           d_branch != ctx->prev &&
+           (ctx->prev->control & cpu_to_le16(DESCRIPTOR_CMD)) ==
+            cpu_to_le16(DESCRIPTOR_INPUT_MORE)) {
+               ctx->prev->branch_address = cpu_to_le32(d_bus | z);
+       }
+
+       ctx->prev = d;
+       ctx->prev_z = z;
 }
 
 static void context_stop(struct context *ctx)
@@ -1248,7 +1280,7 @@ static void context_stop(struct context *ctx)
                if (i)
                        udelay(10);
        }
-       dev_err(ohci->card.device, "DMA context still active (0x%08x)\n", reg);
+       ohci_err(ohci, "DMA context still active (0x%08x)\n", reg);
 }
 
 struct driver_data {
@@ -1557,7 +1589,7 @@ static void handle_local_lock(struct fw_ohci *ohci,
                        goto out;
                }
 
-       dev_err(ohci->card.device, "swap not done (CSR lock timeout)\n");
+       ohci_err(ohci, "swap not done (CSR lock timeout)\n");
        fw_fill_response(&response, packet->header, RCODE_BUSY, NULL, 0);
 
  out:
@@ -1632,8 +1664,7 @@ static void detect_dead_context(struct fw_ohci *ohci,
 
        ctl = reg_read(ohci, CONTROL_SET(regs));
        if (ctl & CONTEXT_DEAD)
-               dev_err(ohci->card.device,
-                       "DMA context %s has stopped, error code: %s\n",
+               ohci_err(ohci, "DMA context %s has stopped, error code: %s\n",
                        name, evts[ctl & 0x1f]);
 }
 
@@ -1815,8 +1846,8 @@ static int find_and_insert_self_id(struct fw_ohci *ohci, int self_id_count)
 
        reg = reg_read(ohci, OHCI1394_NodeID);
        if (!(reg & OHCI1394_NodeID_idValid)) {
-               dev_notice(ohci->card.device,
-                          "node ID not valid, new bus reset in progress\n");
+               ohci_notice(ohci,
+                           "node ID not valid, new bus reset in progress\n");
                return -EBUSY;
        }
        self_id |= ((reg & 0x3f) << 24); /* phy ID */
@@ -1863,12 +1894,12 @@ static void bus_reset_work(struct work_struct *work)
 
        reg = reg_read(ohci, OHCI1394_NodeID);
        if (!(reg & OHCI1394_NodeID_idValid)) {
-               dev_notice(ohci->card.device,
-                          "node ID not valid, new bus reset in progress\n");
+               ohci_notice(ohci,
+                           "node ID not valid, new bus reset in progress\n");
                return;
        }
        if ((reg & OHCI1394_NodeID_nodeNumber) == 63) {
-               dev_notice(ohci->card.device, "malconfigured bus\n");
+               ohci_notice(ohci, "malconfigured bus\n");
                return;
        }
        ohci->node_id = reg & (OHCI1394_NodeID_busNumber |
@@ -1882,7 +1913,7 @@ static void bus_reset_work(struct work_struct *work)
 
        reg = reg_read(ohci, OHCI1394_SelfIDCount);
        if (reg & OHCI1394_SelfIDCount_selfIDError) {
-               dev_notice(ohci->card.device, "inconsistent self IDs\n");
+               ohci_notice(ohci, "self ID receive error\n");
                return;
        }
        /*
@@ -1894,7 +1925,7 @@ static void bus_reset_work(struct work_struct *work)
        self_id_count = (reg >> 3) & 0xff;
 
        if (self_id_count > 252) {
-               dev_notice(ohci->card.device, "inconsistent self IDs\n");
+               ohci_notice(ohci, "bad selfIDSize (%08x)\n", reg);
                return;
        }
 
@@ -1902,7 +1933,10 @@ static void bus_reset_work(struct work_struct *work)
        rmb();
 
        for (i = 1, j = 0; j < self_id_count; i += 2, j++) {
-               if (ohci->self_id_cpu[i] != ~ohci->self_id_cpu[i + 1]) {
+               u32 id  = cond_le32_to_cpu(ohci->self_id_cpu[i]);
+               u32 id2 = cond_le32_to_cpu(ohci->self_id_cpu[i + 1]);
+
+               if (id != ~id2) {
                        /*
                         * If the invalid data looks like a cycle start packet,
                         * it's likely to be the result of the cycle master
@@ -1910,33 +1944,30 @@ static void bus_reset_work(struct work_struct *work)
                         * so far are valid and should be processed so that the
                         * bus manager can then correct the gap count.
                         */
-                       if (cond_le32_to_cpu(ohci->self_id_cpu[i])
-                                                       == 0xffff008f) {
-                               dev_notice(ohci->card.device,
-                                          "ignoring spurious self IDs\n");
+                       if (id == 0xffff008f) {
+                               ohci_notice(ohci, "ignoring spurious self IDs\n");
                                self_id_count = j;
                                break;
-                       } else {
-                               dev_notice(ohci->card.device,
-                                          "inconsistent self IDs\n");
-                               return;
                        }
+
+                       ohci_notice(ohci, "bad self ID %d/%d (%08x != ~%08x)\n",
+                                   j, self_id_count, id, id2);
+                       return;
                }
-               ohci->self_id_buffer[j] =
-                               cond_le32_to_cpu(ohci->self_id_cpu[i]);
+               ohci->self_id_buffer[j] = id;
        }
 
        if (ohci->quirks & QUIRK_TI_SLLZ059) {
                self_id_count = find_and_insert_self_id(ohci, self_id_count);
                if (self_id_count < 0) {
-                       dev_notice(ohci->card.device,
-                                  "could not construct local self ID\n");
+                       ohci_notice(ohci,
+                                   "could not construct local self ID\n");
                        return;
                }
        }
 
        if (self_id_count == 0) {
-               dev_notice(ohci->card.device, "inconsistent self IDs\n");
+               ohci_notice(ohci, "no self IDs\n");
                return;
        }
        rmb();
@@ -1957,8 +1988,7 @@ static void bus_reset_work(struct work_struct *work)
 
        new_generation = (reg_read(ohci, OHCI1394_SelfIDCount) >> 16) & 0xff;
        if (new_generation != generation) {
-               dev_notice(ohci->card.device,
-                          "new bus reset, discarding self ids\n");
+               ohci_notice(ohci, "new bus reset, discarding self ids\n");
                return;
        }
 
@@ -2096,7 +2126,7 @@ static irqreturn_t irq_handler(int irq, void *data)
        }
 
        if (unlikely(event & OHCI1394_regAccessFail))
-               dev_err(ohci->card.device, "register access failure\n");
+               ohci_err(ohci, "register access failure\n");
 
        if (unlikely(event & OHCI1394_postedWriteErr)) {
                reg_read(ohci, OHCI1394_PostedWriteAddressHi);
@@ -2104,13 +2134,12 @@ static irqreturn_t irq_handler(int irq, void *data)
                reg_write(ohci, OHCI1394_IntEventClear,
                          OHCI1394_postedWriteErr);
                if (printk_ratelimit())
-                       dev_err(ohci->card.device, "PCI posted write error\n");
+                       ohci_err(ohci, "PCI posted write error\n");
        }
 
        if (unlikely(event & OHCI1394_cycleTooLong)) {
                if (printk_ratelimit())
-                       dev_notice(ohci->card.device,
-                                  "isochronous cycle too long\n");
+                       ohci_notice(ohci, "isochronous cycle too long\n");
                reg_write(ohci, OHCI1394_LinkControlSet,
                          OHCI1394_LinkControl_cycleMaster);
        }
@@ -2123,8 +2152,7 @@ static irqreturn_t irq_handler(int irq, void *data)
                 * them at least two cycles later.  (FIXME?)
                 */
                if (printk_ratelimit())
-                       dev_notice(ohci->card.device,
-                                  "isochronous cycle inconsistent\n");
+                       ohci_notice(ohci, "isochronous cycle inconsistent\n");
        }
 
        if (unlikely(event & OHCI1394_unrecoverableError))
@@ -2246,12 +2274,11 @@ static int ohci_enable(struct fw_card *card,
                       const __be32 *config_rom, size_t length)
 {
        struct fw_ohci *ohci = fw_ohci(card);
-       struct pci_dev *dev = to_pci_dev(card->device);
        u32 lps, version, irqs;
        int i, ret;
 
        if (software_reset(ohci)) {
-               dev_err(card->device, "failed to reset ohci card\n");
+               ohci_err(ohci, "failed to reset ohci card\n");
                return -EBUSY;
        }
 
@@ -2262,20 +2289,31 @@ static int ohci_enable(struct fw_card *card,
         * will lock up the machine.  Wait 50msec to make sure we have
         * full link enabled.  However, with some cards (well, at least
         * a JMicron PCIe card), we have to try again sometimes.
+        *
+        * TI TSB82AA2 + TSB81BA3(A) cards signal LPS enabled early but
+        * cannot actually use the phy at that time.  These need tens of
+        * milliseconds pause between LPS write and first phy access too.
+        *
+        * But do not wait for 50msec on Agere/LSI cards.  Their phy
+        * arbitration state machine may time out during such a long wait.
         */
+
        reg_write(ohci, OHCI1394_HCControlSet,
                  OHCI1394_HCControl_LPS |
                  OHCI1394_HCControl_postedWriteEnable);
        flush_writes(ohci);
 
-       for (lps = 0, i = 0; !lps && i < 3; i++) {
+       if (!(ohci->quirks & QUIRK_PHY_LCTRL_TIMEOUT))
                msleep(50);
+
+       for (lps = 0, i = 0; !lps && i < 150; i++) {
+               msleep(1);
                lps = reg_read(ohci, OHCI1394_HCControlSet) &
                      OHCI1394_HCControl_LPS;
        }
 
        if (!lps) {
-               dev_err(card->device, "failed to set Link Power Status\n");
+               ohci_err(ohci, "failed to set Link Power Status\n");
                return -EIO;
        }
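[Editorial aside, not part of the patch: the comment and code above replace the old fixed retry (three 50 ms sleeps) with a quirk-aware wait. Controllers flagged QUIRK_PHY_LCTRL_TIMEOUT (Agere/LSI) skip the initial 50 ms settle time, and LPS is then polled once per millisecond for up to 150 ms. Reduced to a standalone sketch with a hypothetical helper name:

	/* Sketch only; assumes the reg_read()/quirk definitions used above. */
	static int wait_for_lps(struct fw_ohci *ohci)
	{
		u32 lps = 0;
		int i;

		if (!(ohci->quirks & QUIRK_PHY_LCTRL_TIMEOUT))
			msleep(50);	/* long settle time, skipped on Agere/LSI */

		for (i = 0; !lps && i < 150; i++) {
			msleep(1);	/* then poll LPS, up to ~150 ms total */
			lps = reg_read(ohci, OHCI1394_HCControlSet) &
			      OHCI1394_HCControl_LPS;
		}

		return lps ? 0 : -EIO;	/* caller reports the LPS failure */
	}
]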
 
@@ -2284,7 +2322,7 @@ static int ohci_enable(struct fw_card *card,
                if (ret < 0)
                        return ret;
                if (ret)
-                       dev_notice(card->device, "local TSB41BA3D phy\n");
+                       ohci_notice(ohci, "local TSB41BA3D phy\n");
                else
                        ohci->quirks &= ~QUIRK_TI_SLLZ059;
        }
@@ -2382,24 +2420,6 @@ static int ohci_enable(struct fw_card *card,
 
        reg_write(ohci, OHCI1394_AsReqFilterHiSet, 0x80000000);
 
-       if (!(ohci->quirks & QUIRK_NO_MSI))
-               pci_enable_msi(dev);
-       if (request_irq(dev->irq, irq_handler,
-                       pci_dev_msi_enabled(dev) ? 0 : IRQF_SHARED,
-                       ohci_driver_name, ohci)) {
-               dev_err(card->device, "failed to allocate interrupt %d\n",
-                       dev->irq);
-               pci_disable_msi(dev);
-
-               if (config_rom) {
-                       dma_free_coherent(ohci->card.device, CONFIG_ROM_SIZE,
-                                         ohci->next_config_rom,
-                                         ohci->next_config_rom_bus);
-                       ohci->next_config_rom = NULL;
-               }
-               return -EIO;
-       }
-
        irqs =  OHCI1394_reqTxComplete | OHCI1394_respTxComplete |
                OHCI1394_RQPkt | OHCI1394_RSPkt |
                OHCI1394_isochTx | OHCI1394_isochRx |
@@ -3578,20 +3598,20 @@ static int pci_probe(struct pci_dev *dev,
 
        if (!(pci_resource_flags(dev, 0) & IORESOURCE_MEM) ||
            pci_resource_len(dev, 0) < OHCI1394_REGISTER_SIZE) {
-               dev_err(&dev->dev, "invalid MMIO resource\n");
+               ohci_err(ohci, "invalid MMIO resource\n");
                err = -ENXIO;
                goto fail_disable;
        }
 
        err = pci_request_region(dev, 0, ohci_driver_name);
        if (err) {
-               dev_err(&dev->dev, "MMIO resource unavailable\n");
+               ohci_err(ohci, "MMIO resource unavailable\n");
                goto fail_disable;
        }
 
        ohci->registers = pci_iomap(dev, 0, OHCI1394_REGISTER_SIZE);
        if (ohci->registers == NULL) {
-               dev_err(&dev->dev, "failed to remap registers\n");
+               ohci_err(ohci, "failed to remap registers\n");
                err = -ENXIO;
                goto fail_iomem;
        }
@@ -3675,19 +3695,33 @@ static int pci_probe(struct pci_dev *dev,
        guid = ((u64) reg_read(ohci, OHCI1394_GUIDHi) << 32) |
                reg_read(ohci, OHCI1394_GUIDLo);
 
+       if (!(ohci->quirks & QUIRK_NO_MSI))
+               pci_enable_msi(dev);
+       if (request_irq(dev->irq, irq_handler,
+                       pci_dev_msi_enabled(dev) ? 0 : IRQF_SHARED,
+                       ohci_driver_name, ohci)) {
+               ohci_err(ohci, "failed to allocate interrupt %d\n", dev->irq);
+               err = -EIO;
+               goto fail_msi;
+       }
+
        err = fw_card_add(&ohci->card, max_receive, link_speed, guid);
        if (err)
-               goto fail_contexts;
+               goto fail_irq;
 
        version = reg_read(ohci, OHCI1394_Version) & 0x00ff00ff;
-       dev_notice(&dev->dev,
-                 "added OHCI v%x.%x device as card %d, "
-                 "%d IR + %d IT contexts, quirks 0x%x\n",
-                 version >> 16, version & 0xff, ohci->card.index,
-                 ohci->n_ir, ohci->n_it, ohci->quirks);
+       ohci_notice(ohci,
+                   "added OHCI v%x.%x device as card %d, "
+                   "%d IR + %d IT contexts, quirks 0x%x\n",
+                   version >> 16, version & 0xff, ohci->card.index,
+                   ohci->n_ir, ohci->n_it, ohci->quirks);
 
        return 0;
 
+ fail_irq:
+       free_irq(dev->irq, ohci);
+ fail_msi:
+       pci_disable_msi(dev);
  fail_contexts:
        kfree(ohci->ir_context_list);
        kfree(ohci->it_context_list);
@@ -3711,19 +3745,21 @@ static int pci_probe(struct pci_dev *dev,
        kfree(ohci);
        pmac_ohci_off(dev);
  fail:
-       if (err == -ENOMEM)
-               dev_err(&dev->dev, "out of memory\n");
-
        return err;
 }
 
 static void pci_remove(struct pci_dev *dev)
 {
-       struct fw_ohci *ohci;
+       struct fw_ohci *ohci = pci_get_drvdata(dev);
 
-       ohci = pci_get_drvdata(dev);
-       reg_write(ohci, OHCI1394_IntMaskClear, ~0);
-       flush_writes(ohci);
+       /*
+        * If the removal is happening from the suspend state, LPS won't be
+        * enabled and host registers (e.g., IntMaskClear) won't be accessible.
+        */
+       if (reg_read(ohci, OHCI1394_HCControlSet) & OHCI1394_HCControl_LPS) {
+               reg_write(ohci, OHCI1394_IntMaskClear, ~0);
+               flush_writes(ohci);
+       }
        cancel_work_sync(&ohci->bus_reset_work);
        fw_core_remove_card(&ohci->card);
 
@@ -3766,16 +3802,14 @@ static int pci_suspend(struct pci_dev *dev, pm_message_t state)
        int err;
 
        software_reset(ohci);
-       free_irq(dev->irq, ohci);
-       pci_disable_msi(dev);
        err = pci_save_state(dev);
        if (err) {
-               dev_err(&dev->dev, "pci_save_state failed\n");
+               ohci_err(ohci, "pci_save_state failed\n");
                return err;
        }
        err = pci_set_power_state(dev, pci_choose_state(dev, state));
        if (err)
-               dev_err(&dev->dev, "pci_set_power_state failed with %d\n", err);
+               ohci_err(ohci, "pci_set_power_state failed with %d\n", err);
        pmac_ohci_off(dev);
 
        return 0;
@@ -3791,7 +3825,7 @@ static int pci_resume(struct pci_dev *dev)
        pci_restore_state(dev);
        err = pci_enable_device(dev);
        if (err) {
-               dev_err(&dev->dev, "pci_enable_device failed\n");
+               ohci_err(ohci, "pci_enable_device failed\n");
                return err;
        }
 
@@ -3837,6 +3871,4 @@ MODULE_DESCRIPTION("Driver for PCI OHCI IEEE1394 controllers");
 MODULE_LICENSE("GPL");
 
 /* Provide a module alias so root-on-sbp2 initrds don't break. */
-#ifndef CONFIG_IEEE1394_OHCI1394_MODULE
 MODULE_ALIAS("ohci1394");
-#endif
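[Editorial aside, not part of the patch: the bulk of the firewire-ohci hunks above are mechanical conversions from dev_err()/dev_notice() on ohci->card.device to ohci_err()/ohci_notice(). The wrapper macros are added by the same patch outside the hunks shown here; they are presumably thin shims along these lines (a sketch, the real definitions may differ):

	#define ohci_notice(ohci, fmt, args...) \
		dev_notice((ohci)->card.device, fmt, ##args)
	#define ohci_err(ohci, fmt, args...) \
		dev_err((ohci)->card.device, fmt, ##args)

Dropping the local struct pci_dev *dev in ohci_enable() follows from the same cleanup: its only remaining users were the dev_err() calls and the MSI/IRQ setup, both of which this patch moves into pci_probe().]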
index 1162d6b3bf8561d6ed1cfe399643dff6deb4027b..47674b91384321bd16b4dfa5c4ec69c190a80551 100644 (file)
@@ -1144,8 +1144,8 @@ static int sbp2_probe(struct device *dev)
                return -ENODEV;
 
        if (dma_get_max_seg_size(device->card->device) > SBP2_MAX_SEG_SIZE)
-               BUG_ON(dma_set_max_seg_size(device->card->device,
-                                           SBP2_MAX_SEG_SIZE));
+               WARN_ON(dma_set_max_seg_size(device->card->device,
+                                            SBP2_MAX_SEG_SIZE));
 
        shost = scsi_host_alloc(&scsi_driver_template, sizeof(*tgt));
        if (shost == NULL)
@@ -1475,10 +1475,8 @@ static int sbp2_scsi_queuecommand(struct Scsi_Host *shost,
        }
 
        orb = kzalloc(sizeof(*orb), GFP_ATOMIC);
-       if (orb == NULL) {
-               dev_notice(lu_dev(lu), "failed to alloc ORB\n");
+       if (orb == NULL)
                return SCSI_MLQUEUE_HOST_BUSY;
-       }
 
        /* Initialize rcode to something not RCODE_COMPLETE. */
        orb->base.rcode = -1;
@@ -1636,9 +1634,7 @@ MODULE_LICENSE("GPL");
 MODULE_DEVICE_TABLE(ieee1394, sbp2_id_table);
 
 /* Provide a module alias so root-on-sbp2 initrds don't break. */
-#ifndef CONFIG_IEEE1394_SBP2_MODULE
 MODULE_ALIAS("sbp2");
-#endif
 
 static int __init sbp2_init(void)
 {
index c22eed9481e3afd4f6d7dcc8b583e347d500491e..87d567089f13653297ddc001ec0fda8746254465 100644 (file)
@@ -38,7 +38,6 @@ config GPIO_DEVRES
 menuconfig GPIOLIB
        bool "GPIO Support"
        depends on ARCH_WANT_OPTIONAL_GPIOLIB || ARCH_REQUIRE_GPIOLIB
-       select GENERIC_GPIO
        help
          This enables GPIO support through the generic GPIO library.
          You only need to enable this, if you also want to enable
index dda6a756a3d9946b6d51693a9e09fe48c3cd4ac9..90a80eb688a920ca594c144264d15b158b113af0 100644 (file)
@@ -255,7 +255,7 @@ static int __get_gpo_state_p3(struct lpc32xx_gpio_chip *group,
 }
 
 /*
- * GENERIC_GPIO primitives.
+ * GPIO primitives.
  */
 static int lpc32xx_gpio_dir_input_p012(struct gpio_chip *chip,
        unsigned pin)
index c7c3128393d1dcdce0589d5a268893479ae61804..70637d23b1f92ad6688300060dadeb4f892ccff9 100644 (file)
@@ -10,7 +10,7 @@ menu "Hardware Spinlock drivers"
 
 config HWSPINLOCK_OMAP
        tristate "OMAP Hardware Spinlock device"
-       depends on ARCH_OMAP4
+       depends on ARCH_OMAP4 || SOC_OMAP5
        select HWSPINLOCK
        help
          Say y here to support the OMAP Hardware Spinlock device (firstly
index adfee98486b159843b38ff107c8ea9dd25189ed5..631736e2e7ed94297a0f871615d6d5b6c9e6acd3 100644 (file)
@@ -363,7 +363,7 @@ config I2C_BLACKFIN_TWI_CLK_KHZ
 
 config I2C_CBUS_GPIO
        tristate "CBUS I2C driver"
-       depends on GENERIC_GPIO
+       depends on GPIOLIB
        help
          Support for CBUS access using I2C API. Mostly relevant for Nokia
          Internet Tablets (770, N800 and N810).
@@ -436,7 +436,7 @@ config I2C_EG20T
 
 config I2C_GPIO
        tristate "GPIO-based bitbanging I2C"
-       depends on GENERIC_GPIO
+       depends on GPIOLIB
        select I2C_ALGOBIT
        help
          This is a very simple bitbanging I2C driver utilizing the
index 5faf244d2476bc35f245c067c767226af37a3d05..f7f9865b8b898864d4d711ada8bb8591ce289529 100644 (file)
@@ -7,7 +7,7 @@ menu "Multiplexer I2C Chip support"
 
 config I2C_ARB_GPIO_CHALLENGE
        tristate "GPIO-based I2C arbitration"
-       depends on GENERIC_GPIO && OF
+       depends on GPIOLIB && OF
        help
          If you say yes to this option, support will be included for an
          I2C multimaster arbitration scheme using GPIOs and a challenge &
@@ -19,7 +19,7 @@ config I2C_ARB_GPIO_CHALLENGE
 
 config I2C_MUX_GPIO
        tristate "GPIO-based I2C multiplexer"
-       depends on GENERIC_GPIO
+       depends on GPIOLIB
        help
          If you say yes to this option, support will be included for a
          GPIO based I2C multiplexer. This driver provides access to
index b231139263883e128bc084329d08fcebf4ee06a0..2ff6204449300e088883d411b3c4f0f46d2eabaf 100644 (file)
@@ -1606,7 +1606,7 @@ out:
        return rc;
 }
 
-static int idecd_release(struct gendisk *disk, fmode_t mode)
+static void idecd_release(struct gendisk *disk, fmode_t mode)
 {
        struct cdrom_info *info = ide_drv_g(disk, cdrom_info);
 
@@ -1615,8 +1615,6 @@ static int idecd_release(struct gendisk *disk, fmode_t mode)
 
        ide_cd_put(info);
        mutex_unlock(&ide_cd_mutex);
-
-       return 0;
 }
 
 static int idecd_set_spindown(struct cdrom_device_info *cdi, unsigned long arg)
index 70ea8763567dcc7b1b7fd946275dea49303534f6..de86631e767d6d08b6c500e3162aa5ca50551c85 100644 (file)
@@ -250,7 +250,7 @@ static int ide_gd_unlocked_open(struct block_device *bdev, fmode_t mode)
 }
 
 
-static int ide_gd_release(struct gendisk *disk, fmode_t mode)
+static void ide_gd_release(struct gendisk *disk, fmode_t mode)
 {
        struct ide_disk_obj *idkp = ide_drv_g(disk, ide_disk_obj);
        ide_drive_t *drive = idkp->drive;
@@ -270,8 +270,6 @@ static int ide_gd_release(struct gendisk *disk, fmode_t mode)
 
        ide_disk_put(idkp);
        mutex_unlock(&ide_gd_mutex);
-
-       return 0;
 }
 
 static int ide_gd_getgeo(struct block_device *bdev, struct hd_geometry *geo)
index 89f859591bbb4ff49e1ac5ddde8139607e78ec9a..c6c574bd5f59e079199072e0f27311804f4faff4 100644 (file)
@@ -1918,15 +1918,13 @@ static int idetape_open(struct block_device *bdev, fmode_t mode)
        return 0;
 }
 
-static int idetape_release(struct gendisk *disk, fmode_t mode)
+static void idetape_release(struct gendisk *disk, fmode_t mode)
 {
        struct ide_tape_obj *tape = ide_drv_g(disk, ide_tape_obj);
 
        mutex_lock(&ide_tape_mutex);
        ide_tape_put(tape);
        mutex_unlock(&ide_tape_mutex);
-
-       return 0;
 }
 
 static int idetape_ioctl(struct block_device *bdev, fmode_t mode,
index 0bb99bb38809f12e7a1c2b7559f77e9a6744c9d0..c47c2034ca71f9a95f3153fbc31756fc34b1fde0 100644 (file)
@@ -878,6 +878,8 @@ static void cm_work_handler(struct work_struct *_work)
                        }
                        return;
                }
+               if (empty)
+                       return;
                spin_lock_irqsave(&cm_id_priv->lock, flags);
        }
        spin_unlock_irqrestore(&cm_id_priv->lock, flags);
index a8fdd3381405dc62aa92118954b83e7ec0fdb8db..22192deb88282b51f195521e6468840655bb219e 100644 (file)
@@ -348,7 +348,8 @@ static void __ib_shared_qp_event_handler(struct ib_event *event, void *context)
        struct ib_qp *qp = context;
 
        list_for_each_entry(event->element.qp, &qp->open_list, open_list)
-               event->element.qp->event_handler(event, event->element.qp->qp_context);
+               if (event->element.qp->event_handler)
+                       event->element.qp->event_handler(event, event->element.qp->qp_context);
 }
 
 static void __ib_insert_xrcd_qp(struct ib_xrcd *xrcd, struct ib_qp *qp)
index 31f9201b29809512ae5a4140456e0b457fa5e9cc..c40088ecf9f3b5fed843f88815997f593d6177b5 100644 (file)
@@ -62,13 +62,13 @@ static int __cxio_init_resource_fifo(struct kfifo *fifo,
                kfifo_in(fifo, (unsigned char *) &entry, sizeof(u32));
        if (random) {
                j = 0;
-               random_bytes = random32();
+               random_bytes = prandom_u32();
                for (i = 0; i < RANDOM_SIZE; i++)
                        rarray[i] = i + skip_low;
                for (i = skip_low + RANDOM_SIZE; i < nr - skip_high; i++) {
                        if (j >= RANDOM_SIZE) {
                                j = 0;
-                               random_bytes = random32();
+                               random_bytes = prandom_u32();
                        }
                        idx = (random_bytes >> (j * 2)) & 0xF;
                        kfifo_in(fifo,
index 9c12da0cbd32539b5a8060a7a6deed8101cdd25d..e87f2201b220673030ca18a3957642b623d065a7 100644 (file)
@@ -559,7 +559,7 @@ static int iwch_reregister_phys_mem(struct ib_mr *mr,
        __be64 *page_list = NULL;
        int shift = 0;
        u64 total_size;
-       int npages;
+       int npages = 0;
        int ret;
 
        PDBG("%s ib_mr %p ib_pd %p\n", __func__, mr, pd);
index f95e5df30db20682b2199120fa9ba8d2e8523d56..0161ae6ad6293680ce6665e3f1bdb07b1fd0f89b 100644 (file)
@@ -54,7 +54,7 @@ u32 c4iw_id_alloc(struct c4iw_id_table *alloc)
 
        if (obj < alloc->max) {
                if (alloc->flags & C4IW_ID_TABLE_F_RANDOM)
-                       alloc->last += random32() % RANDOM_SKIP;
+                       alloc->last += prandom_u32() % RANDOM_SKIP;
                else
                        alloc->last = obj + 1;
                if (alloc->last >= alloc->max)
@@ -88,7 +88,7 @@ int c4iw_id_table_alloc(struct c4iw_id_table *alloc, u32 start, u32 num,
        alloc->start = start;
        alloc->flags = flags;
        if (flags & C4IW_ID_TABLE_F_RANDOM)
-               alloc->last = random32() % RANDOM_SKIP;
+               alloc->last = prandom_u32() % RANDOM_SKIP;
        else
                alloc->last = 0;
        alloc->max  = num;
index 5b059e2d80cc18019b060a7143bdf59098879ef0..232040447e8a23803cc73e64c203734d36492c2e 100644 (file)
@@ -111,6 +111,16 @@ static int alloc_host_sq(struct c4iw_rdev *rdev, struct t4_sq *sq)
        return 0;
 }
 
+static int alloc_sq(struct c4iw_rdev *rdev, struct t4_sq *sq, int user)
+{
+       int ret = -ENOSYS;
+       if (user)
+               ret = alloc_oc_sq(rdev, sq);
+       if (ret)
+               ret = alloc_host_sq(rdev, sq);
+       return ret;
+}
+
 static int destroy_qp(struct c4iw_rdev *rdev, struct t4_wq *wq,
                      struct c4iw_dev_ucontext *uctx)
 {
@@ -179,15 +189,9 @@ static int create_qp(struct c4iw_rdev *rdev, struct t4_wq *wq,
                goto free_sw_rq;
        }
 
-       if (user) {
-               if (alloc_oc_sq(rdev, &wq->sq) && alloc_host_sq(rdev, &wq->sq))
-                       goto free_hwaddr;
-       } else {
-               ret = alloc_host_sq(rdev, &wq->sq);
-               if (ret)
-                       goto free_hwaddr;
-       }
-
+       ret = alloc_sq(rdev, &wq->sq, user);
+       if (ret)
+               goto free_hwaddr;
        memset(wq->sq.queue, 0, wq->sq.memsize);
        dma_unmap_addr_set(&wq->sq, mapping, wq->sq.dma_addr);
 
index aed8afee56da16a6a3609a247c9bea2c54060c44..6d7f453b4d05ef7da7f74aeafe22608b85dc00fc 100644 (file)
@@ -40,6 +40,7 @@
 #include <linux/slab.h>
 #include <linux/highmem.h>
 #include <linux/io.h>
+#include <linux/aio.h>
 #include <linux/jiffies.h>
 #include <linux/cpu.h>
 #include <asm/pgtable.h>
index ea93870266eb7fda30ddf502b5d233fb1116423b..44ea9390417ceb0a572058ff9097be0d3871fe8e 100644 (file)
@@ -2187,7 +2187,8 @@ int ipath_register_ib_device(struct ipath_devdata *dd)
        if (ret)
                goto err_reg;
 
-       if (ipath_verbs_register_sysfs(dev))
+       ret = ipath_verbs_register_sysfs(dev);
+       if (ret)
                goto err_class;
 
        enable_timer(dd);
@@ -2327,15 +2328,15 @@ static int ipath_verbs_register_sysfs(struct ib_device *dev)
        int i;
        int ret;
 
-       for (i = 0; i < ARRAY_SIZE(ipath_class_attributes); ++i)
-               if (device_create_file(&dev->dev,
-                                      ipath_class_attributes[i])) {
-                       ret = 1;
+       for (i = 0; i < ARRAY_SIZE(ipath_class_attributes); ++i) {
+               ret = device_create_file(&dev->dev,
+                                      ipath_class_attributes[i]);
+               if (ret)
                        goto bail;
-               }
-
-       ret = 0;
-
+       }
+       return 0;
 bail:
+       for (i = 0; i < ARRAY_SIZE(ipath_class_attributes); ++i)
+               device_remove_file(&dev->dev, ipath_class_attributes[i]);
        return ret;
 }
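[Editorial aside, not part of the patch: the ipath_verbs_register_sysfs() change above, and the matching qib_verbs_register_sysfs() hunk further down, fix the same leak: if creating one of the class attributes fails partway through, the files already created are now removed before the error is returned. The shape of the fix, reduced to a generic sketch with hypothetical names (removing an attribute that was never created is a harmless no-op, so the teardown can walk the whole array):

	static int register_attrs(struct device *dev,
				  struct device_attribute **attrs, int n)
	{
		int i, ret = 0;

		for (i = 0; i < n; i++) {
			ret = device_create_file(dev, attrs[i]);
			if (ret)
				break;
		}
		if (ret)
			for (i = 0; i < n; i++)
				device_remove_file(dev, attrs[i]);
		return ret;
	}
]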
index 73b3a7132587b6fac4e05dfe33ed215c98dc104f..d5e60f44ba5ad7c4f5ad53d7e0fe348248ed797b 100644 (file)
@@ -33,6 +33,7 @@
 
 #include <linux/mlx4/cq.h>
 #include <linux/mlx4/qp.h>
+#include <linux/mlx4/srq.h>
 #include <linux/slab.h>
 
 #include "mlx4_ib.h"
@@ -585,6 +586,7 @@ static int mlx4_ib_poll_one(struct mlx4_ib_cq *cq,
        struct mlx4_qp *mqp;
        struct mlx4_ib_wq *wq;
        struct mlx4_ib_srq *srq;
+       struct mlx4_srq *msrq = NULL;
        int is_send;
        int is_error;
        u32 g_mlpath_rqpn;
@@ -653,6 +655,20 @@ repoll:
 
        wc->qp = &(*cur_qp)->ibqp;
 
+       if (wc->qp->qp_type == IB_QPT_XRC_TGT) {
+               u32 srq_num;
+               g_mlpath_rqpn = be32_to_cpu(cqe->g_mlpath_rqpn);
+               srq_num       = g_mlpath_rqpn & 0xffffff;
+               /* SRQ is also in the radix tree */
+               msrq = mlx4_srq_lookup(to_mdev(cq->ibcq.device)->dev,
+                                      srq_num);
+               if (unlikely(!msrq)) {
+                       pr_warn("CQ %06x with entry for unknown SRQN %06x\n",
+                               cq->mcq.cqn, srq_num);
+                       return -EINVAL;
+               }
+       }
+
        if (is_send) {
                wq = &(*cur_qp)->sq;
                if (!(*cur_qp)->sq_signal_bits) {
@@ -666,6 +682,11 @@ repoll:
                wqe_ctr = be16_to_cpu(cqe->wqe_index);
                wc->wr_id = srq->wrid[wqe_ctr];
                mlx4_ib_free_srq_wqe(srq, wqe_ctr);
+       } else if (msrq) {
+               srq = to_mibsrq(msrq);
+               wqe_ctr = be16_to_cpu(cqe->wqe_index);
+               wc->wr_id = srq->wrid[wqe_ctr];
+               mlx4_ib_free_srq_wqe(srq, wqe_ctr);
        } else {
                wq        = &(*cur_qp)->rq;
                tail      = wq->tail & (wq->wqe_cnt - 1);
index 934792c477bccbf2b2583453b405067165a1ebf7..4d599cedbb0b1ccceab997a4945be0d8d6e89fde 100644 (file)
@@ -93,7 +93,7 @@ static void __propagate_pkey_ev(struct mlx4_ib_dev *dev, int port_num,
 __be64 mlx4_ib_gen_node_guid(void)
 {
 #define NODE_GUID_HI   ((u64) (((u64)IB_OPENIB_OUI) << 40))
-       return cpu_to_be64(NODE_GUID_HI | random32());
+       return cpu_to_be64(NODE_GUID_HI | prandom_u32());
 }
 
 __be64 mlx4_ib_get_new_demux_tid(struct mlx4_ib_demux_ctx *ctx)
index 35cced2a4da835b1254f7e7a76bc1d6b84752057..4f10af2905b505e1b9ae5c8a2ab13a634cdc4b4a 100644 (file)
@@ -1292,6 +1292,8 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,
        if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) {
                context->sq_size_stride |= !!qp->sq_no_prefetch << 7;
                context->xrcd = cpu_to_be32((u32) qp->xrcdn);
+               if (ibqp->qp_type == IB_QPT_RAW_PACKET)
+                       context->param3 |= cpu_to_be32(1 << 30);
        }
 
        if (qp->ibqp.uobject)
@@ -1458,6 +1460,10 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,
                }
        }
 
+       if (qp->ibqp.qp_type == IB_QPT_RAW_PACKET)
+               context->pri_path.ackto = (context->pri_path.ackto & 0xf8) |
+                                       MLX4_IB_LINK_TYPE_ETH;
+
        if (cur_state == IB_QPS_RTS && new_state == IB_QPS_SQD  &&
            attr_mask & IB_QP_EN_SQD_ASYNC_NOTIFY && attr->en_sqd_async_notify)
                sqd_event = 1;
index 4f7aa301b3b1171b7d6557332278a27ccb6f6659..b56c9428f3c5f5fbf574b0b9570ec2caa02cbee3 100644 (file)
@@ -39,7 +39,7 @@
 #include <linux/vmalloc.h>
 #include <linux/highmem.h>
 #include <linux/io.h>
-#include <linux/uio.h>
+#include <linux/aio.h>
 #include <linux/jiffies.h>
 #include <asm/pgtable.h>
 #include <linux/delay.h>
index 034cc821de5ce9e5172046ee7b05f1fc822c5e88..3c8e4e3caca6240175bbddb35fb304107179920e 100644 (file)
@@ -808,10 +808,14 @@ int qib_verbs_register_sysfs(struct qib_devdata *dd)
        for (i = 0; i < ARRAY_SIZE(qib_attributes); ++i) {
                ret = device_create_file(&dev->dev, qib_attributes[i]);
                if (ret)
-                       return ret;
+                       goto bail;
        }
 
        return 0;
+bail:
+       for (i = 0; i < ARRAY_SIZE(qib_attributes); ++i)
+               device_remove_file(&dev->dev, qib_attributes[i]);
+       return ret;
 }
 
 /*
index 7c0ab16a2fe230fc31b44fd43b04f3e92d2a26b8..904c384aa36142b95455066cd6fcf80510b80959 100644 (file)
@@ -2234,7 +2234,8 @@ int qib_register_ib_device(struct qib_devdata *dd)
        if (ret)
                goto err_agents;
 
-       if (qib_verbs_register_sysfs(dd))
+       ret = qib_verbs_register_sysfs(dd);
+       if (ret)
                goto err_class;
 
        goto bail;
index 1ef880de3a41d97fe328951069f3b6c30066daf1..3eceb61e3532844555b2656b42f50a4ca42664dc 100644 (file)
@@ -460,7 +460,7 @@ static int ipoib_cm_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *even
                goto err_qp;
        }
 
-       psn = random32() & 0xffffff;
+       psn = prandom_u32() & 0xffffff;
        ret = ipoib_cm_modify_rx_qp(dev, cm_id, p->qp, psn);
        if (ret)
                goto err_modify;
index 554b9063da5492aab4763051146121eca6774e84..b6e049a3c7a853b92d5c9d126c61a2d850f290cb 100644 (file)
@@ -830,7 +830,7 @@ static int ipoib_hard_header(struct sk_buff *skb,
         */
        memcpy(cb->hwaddr, daddr, INFINIBAND_ALEN);
 
-       return 0;
+       return sizeof *header;
 }
 
 static void ipoib_set_mcast_list(struct net_device *dev)
index 0ab8c9cc3a7893dec0112ecf7e21d4eb13c875dd..f19b0998a53cfbdffd644a627681864dfca049da 100644 (file)
@@ -82,10 +82,10 @@ module_param_named(max_lun, iscsi_max_lun, uint, S_IRUGO);
 
 int iser_debug_level = 0;
 
-MODULE_DESCRIPTION("iSER (iSCSI Extensions for RDMA) Datamover "
-                  "v" DRV_VER " (" DRV_DATE ")");
+MODULE_DESCRIPTION("iSER (iSCSI Extensions for RDMA) Datamover");
 MODULE_LICENSE("Dual BSD/GPL");
 MODULE_AUTHOR("Alex Nezhinsky, Dan Bar Dov, Or Gerlitz");
+MODULE_VERSION(DRV_VER);
 
 module_param_named(debug_level, iser_debug_level, int, 0644);
 MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0 (default:disabled)");
@@ -370,8 +370,8 @@ iscsi_iser_conn_bind(struct iscsi_cls_session *cls_session,
        /* binds the iSER connection retrieved from the previously
         * connected ep_handle to the iSCSI layer connection. exchanges
         * connection pointers */
-       iser_err("binding iscsi/iser conn %p %p to ib_conn %p\n",
-                                       conn, conn->dd_data, ib_conn);
+       iser_info("binding iscsi/iser conn %p %p to ib_conn %p\n",
+                 conn, conn->dd_data, ib_conn);
        iser_conn = conn->dd_data;
        ib_conn->iser_conn = iser_conn;
        iser_conn->ib_conn  = ib_conn;
@@ -475,28 +475,28 @@ iscsi_iser_set_param(struct iscsi_cls_conn *cls_conn,
        case ISCSI_PARAM_HDRDGST_EN:
                sscanf(buf, "%d", &value);
                if (value) {
-                       printk(KERN_ERR "DataDigest wasn't negotiated to None");
+                       iser_err("DataDigest wasn't negotiated to None");
                        return -EPROTO;
                }
                break;
        case ISCSI_PARAM_DATADGST_EN:
                sscanf(buf, "%d", &value);
                if (value) {
-                       printk(KERN_ERR "DataDigest wasn't negotiated to None");
+                       iser_err("DataDigest wasn't negotiated to None");
                        return -EPROTO;
                }
                break;
        case ISCSI_PARAM_IFMARKER_EN:
                sscanf(buf, "%d", &value);
                if (value) {
-                       printk(KERN_ERR "IFMarker wasn't negotiated to No");
+                       iser_err("IFMarker wasn't negotiated to No");
                        return -EPROTO;
                }
                break;
        case ISCSI_PARAM_OFMARKER_EN:
                sscanf(buf, "%d", &value);
                if (value) {
-                       printk(KERN_ERR "OFMarker wasn't negotiated to No");
+                       iser_err("OFMarker wasn't negotiated to No");
                        return -EPROTO;
                }
                break;
@@ -596,7 +596,7 @@ iscsi_iser_ep_poll(struct iscsi_endpoint *ep, int timeout_ms)
             ib_conn->state == ISER_CONN_DOWN))
                rc = -1;
 
-       iser_err("ib conn %p rc = %d\n", ib_conn, rc);
+       iser_info("ib conn %p rc = %d\n", ib_conn, rc);
 
        if (rc > 0)
                return 1; /* success, this is the equivalent of POLLOUT */
@@ -623,7 +623,7 @@ iscsi_iser_ep_disconnect(struct iscsi_endpoint *ep)
                iscsi_suspend_tx(ib_conn->iser_conn->iscsi_conn);
 
 
-       iser_err("ib conn %p state %d\n",ib_conn, ib_conn->state);
+       iser_info("ib conn %p state %d\n", ib_conn, ib_conn->state);
        iser_conn_terminate(ib_conn);
 }
 
@@ -682,7 +682,7 @@ static umode_t iser_attr_is_visible(int param_type, int param)
 
 static struct scsi_host_template iscsi_iser_sht = {
        .module                 = THIS_MODULE,
-       .name                   = "iSCSI Initiator over iSER, v." DRV_VER,
+       .name                   = "iSCSI Initiator over iSER",
        .queuecommand           = iscsi_queuecommand,
        .change_queue_depth     = iscsi_change_queue_depth,
        .sg_tablesize           = ISCSI_ISER_SG_TABLESIZE,
@@ -740,7 +740,7 @@ static int __init iser_init(void)
        iser_dbg("Starting iSER datamover...\n");
 
        if (iscsi_max_lun < 1) {
-               printk(KERN_ERR "Invalid max_lun value of %u\n", iscsi_max_lun);
+               iser_err("Invalid max_lun value of %u\n", iscsi_max_lun);
                return -EINVAL;
        }
 
index 5babdb35bda76376e04d56d5ac3e5b9a2137c3e8..06f578cde75b002bbc935e45516d3bf1266c2ca3 100644 (file)
@@ -42,6 +42,7 @@
 
 #include <linux/types.h>
 #include <linux/net.h>
+#include <linux/printk.h>
 #include <scsi/libiscsi.h>
 #include <scsi/scsi_transport_iscsi.h>
 
 
 #define DRV_NAME       "iser"
 #define PFX            DRV_NAME ": "
-#define DRV_VER                "0.1"
-#define DRV_DATE       "May 7th, 2006"
+#define DRV_VER                "1.1"
 
 #define iser_dbg(fmt, arg...)                          \
        do {                                            \
-               if (iser_debug_level > 1)               \
+               if (iser_debug_level > 2)               \
                        printk(KERN_DEBUG PFX "%s:" fmt,\
                                __func__ , ## arg);     \
        } while (0)
 
 #define iser_warn(fmt, arg...)                         \
+       do {                                            \
+               if (iser_debug_level > 1)               \
+                       pr_warn(PFX "%s:" fmt,          \
+                               __func__ , ## arg);     \
+       } while (0)
+
+#define iser_info(fmt, arg...)                         \
        do {                                            \
                if (iser_debug_level > 0)               \
-                       printk(KERN_DEBUG PFX "%s:" fmt,\
+                       pr_info(PFX "%s:" fmt,          \
                                __func__ , ## arg);     \
        } while (0)
 
@@ -133,6 +140,15 @@ struct iser_hdr {
        __be64  read_va;
 } __attribute__((packed));
 
+
+#define ISER_ZBVA_NOT_SUPPORTED                0x80
+#define ISER_SEND_W_INV_NOT_SUPPORTED  0x40
+
+struct iser_cm_hdr {
+       u8      flags;
+       u8      rsvd[3];
+} __packed;
+
 /* Constant PDU lengths calculations */
 #define ISER_HEADERS_LEN  (sizeof(struct iser_hdr) + sizeof(struct iscsi_hdr))
 
index be1edb04b085d92dc1da3eb0cd5b58079f5fc8d4..68ebb7fe072a0f347fa94e2e6572024deae4ebeb 100644 (file)
@@ -416,8 +416,9 @@ int iser_reg_rdma_mem(struct iscsi_iser_task *iser_task,
                        for (i=0 ; i<ib_conn->page_vec->length ; i++)
                                iser_err("page_vec[%d] = 0x%llx\n", i,
                                         (unsigned long long) ib_conn->page_vec->pages[i]);
-                       return err;
                }
+               if (err)
+                       return err;
        }
        return 0;
 }
index 4debadc53106f10422812b912125f6ae484c5e8e..5278916c31038cdea28ad9a524d2aa18d653e21d 100644 (file)
@@ -74,8 +74,9 @@ static int iser_create_device_ib_res(struct iser_device *device)
        struct iser_cq_desc *cq_desc;
 
        device->cqs_used = min(ISER_MAX_CQ, device->ib_device->num_comp_vectors);
-       iser_err("using %d CQs, device %s supports %d vectors\n", device->cqs_used,
-                device->ib_device->name, device->ib_device->num_comp_vectors);
+       iser_info("using %d CQs, device %s supports %d vectors\n",
+                 device->cqs_used, device->ib_device->name,
+                 device->ib_device->num_comp_vectors);
 
        device->cq_desc = kmalloc(sizeof(struct iser_cq_desc) * device->cqs_used,
                                  GFP_KERNEL);
@@ -262,7 +263,7 @@ static int iser_create_ib_conn_res(struct iser_conn *ib_conn)
                        min_index = index;
        device->cq_active_qps[min_index]++;
        mutex_unlock(&ig.connlist_mutex);
-       iser_err("cq index %d used for ib_conn %p\n", min_index, ib_conn);
+       iser_info("cq index %d used for ib_conn %p\n", min_index, ib_conn);
 
        init_attr.event_handler = iser_qp_event_callback;
        init_attr.qp_context    = (void *)ib_conn;
@@ -280,9 +281,9 @@ static int iser_create_ib_conn_res(struct iser_conn *ib_conn)
                goto out_err;
 
        ib_conn->qp = ib_conn->cma_id->qp;
-       iser_err("setting conn %p cma_id %p: fmr_pool %p qp %p\n",
-                ib_conn, ib_conn->cma_id,
-                ib_conn->fmr_pool, ib_conn->cma_id->qp);
+       iser_info("setting conn %p cma_id %p: fmr_pool %p qp %p\n",
+                 ib_conn, ib_conn->cma_id,
+                 ib_conn->fmr_pool, ib_conn->cma_id->qp);
        return ret;
 
 out_err:
@@ -299,9 +300,9 @@ static int iser_free_ib_conn_res(struct iser_conn *ib_conn, int can_destroy_id)
        int cq_index;
        BUG_ON(ib_conn == NULL);
 
-       iser_err("freeing conn %p cma_id %p fmr pool %p qp %p\n",
-                ib_conn, ib_conn->cma_id,
-                ib_conn->fmr_pool, ib_conn->qp);
+       iser_info("freeing conn %p cma_id %p fmr pool %p qp %p\n",
+                 ib_conn, ib_conn->cma_id,
+                 ib_conn->fmr_pool, ib_conn->qp);
 
        /* qp is created only once both addr & route are resolved */
        if (ib_conn->fmr_pool != NULL)
@@ -379,7 +380,7 @@ static void iser_device_try_release(struct iser_device *device)
 {
        mutex_lock(&ig.device_list_mutex);
        device->refcount--;
-       iser_err("device %p refcount %d\n",device,device->refcount);
+       iser_info("device %p refcount %d\n", device, device->refcount);
        if (!device->refcount) {
                iser_free_device_ib_res(device);
                list_del(&device->ig_list);
@@ -498,6 +499,7 @@ static int iser_route_handler(struct rdma_cm_id *cma_id)
 {
        struct rdma_conn_param conn_param;
        int    ret;
+       struct iser_cm_hdr req_hdr;
 
        ret = iser_create_ib_conn_res((struct iser_conn *)cma_id->context);
        if (ret)
@@ -509,6 +511,12 @@ static int iser_route_handler(struct rdma_cm_id *cma_id)
        conn_param.retry_count         = 7;
        conn_param.rnr_retry_count     = 6;
 
+       memset(&req_hdr, 0, sizeof(req_hdr));
+       req_hdr.flags = (ISER_ZBVA_NOT_SUPPORTED |
+                       ISER_SEND_W_INV_NOT_SUPPORTED);
+       conn_param.private_data         = (void *)&req_hdr;
+       conn_param.private_data_len     = sizeof(struct iser_cm_hdr);
+
        ret = rdma_connect(cma_id, &conn_param);
        if (ret) {
                iser_err("failure connecting: %d\n", ret);
@@ -558,8 +566,8 @@ static int iser_cma_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *eve
 {
        int ret = 0;
 
-       iser_err("event %d status %d conn %p id %p\n",
-               event->event, event->status, cma_id->context, cma_id);
+       iser_info("event %d status %d conn %p id %p\n",
+                 event->event, event->status, cma_id->context, cma_id);
 
        switch (event->event) {
        case RDMA_CM_EVENT_ADDR_RESOLVED:
@@ -619,8 +627,8 @@ int iser_connect(struct iser_conn   *ib_conn,
        /* the device is known only --after-- address resolution */
        ib_conn->device = NULL;
 
-       iser_err("connecting to: %pI4, port 0x%x\n",
-                &dst_addr->sin_addr, dst_addr->sin_port);
+       iser_info("connecting to: %pI4, port 0x%x\n",
+                 &dst_addr->sin_addr, dst_addr->sin_port);
 
        ib_conn->state = ISER_CONN_PENDING;
 
index c09d41b1a2ff59a663a3af484bb919a3ce4dffc2..b08ca7a9f76bf8ea70f63bfd6c291d4d71b5814d 100644 (file)
@@ -1374,7 +1374,7 @@ static int srpt_abort_cmd(struct srpt_send_ioctx *ioctx)
                target_put_sess_cmd(ioctx->ch->sess, &ioctx->cmd);
                break;
        default:
-               WARN_ON("ERROR: unexpected command state");
+               WARN(1, "Unexpected command state (%d)", state);
                break;
        }
 
index 6a195d5e90ff6e3b62b463bf635cb5537334642e..62a2c0e4cc998774e1c12185b4f04369bbde4d28 100644 (file)
@@ -175,7 +175,7 @@ config KEYBOARD_EP93XX
 
 config KEYBOARD_GPIO
        tristate "GPIO Buttons"
-       depends on GENERIC_GPIO
+       depends on GPIOLIB
        help
          This driver implements support for buttons connected
          to GPIO pins of various CPUs (and some other chips).
@@ -190,7 +190,7 @@ config KEYBOARD_GPIO
 
 config KEYBOARD_GPIO_POLLED
        tristate "Polled GPIO buttons"
-       depends on GENERIC_GPIO
+       depends on GPIOLIB
        select INPUT_POLLDEV
        help
          This driver implements support for buttons connected
@@ -241,7 +241,7 @@ config KEYBOARD_TCA8418
 
 config KEYBOARD_MATRIX
        tristate "GPIO driven matrix keypad support"
-       depends on GENERIC_GPIO
+       depends on GPIOLIB
        select INPUT_MATRIXKMAP
        help
          Enable support for GPIO driven matrix keypad.
index af80928a46b43a14ce54b325ab07094848797bc6..bb698e1f9e429b1c5d6dee2b59f0603807918d84 100644 (file)
@@ -214,7 +214,7 @@ config INPUT_APANEL
 config INPUT_GP2A
        tristate "Sharp GP2AP002A00F I2C Proximity/Opto sensor driver"
        depends on I2C
-       depends on GENERIC_GPIO
+       depends on GPIOLIB
        help
          Say Y here if you have a Sharp GP2AP002A00F proximity/als combo-chip
          hooked to an I2C bus.
@@ -224,7 +224,7 @@ config INPUT_GP2A
 
 config INPUT_GPIO_TILT_POLLED
        tristate "Polled GPIO tilt switch"
-       depends on GENERIC_GPIO
+       depends on GPIOLIB
        select INPUT_POLLDEV
        help
          This driver implements support for tilt switches connected
@@ -472,7 +472,7 @@ config INPUT_PWM_BEEPER
 
 config INPUT_GPIO_ROTARY_ENCODER
        tristate "Rotary encoders connected to GPIO pins"
-       depends on GPIOLIB && GENERIC_GPIO
+       depends on GPIOLIB
        help
          Say Y here to add support for rotary encoders connected to GPIO lines.
          Check file:Documentation/input/rotary-encoder.txt for more
@@ -484,7 +484,7 @@ config INPUT_GPIO_ROTARY_ENCODER
 config INPUT_RB532_BUTTON
        tristate "Mikrotik Routerboard 532 button interface"
        depends on MIKROTIK_RB532
-       depends on GPIOLIB && GENERIC_GPIO
+       depends on GPIOLIB
        select INPUT_POLLDEV
        help
          Say Y here if you want support for the S1 button built into
index 802bd6a72d736422ec21e2c4d655b78635d014cf..effa9c5f2c5cc6043f0ca988261b02f516c85be6 100644 (file)
@@ -295,7 +295,7 @@ config MOUSE_VSXXXAA
 
 config MOUSE_GPIO
        tristate "GPIO mouse"
-       depends on GENERIC_GPIO
+       depends on GPIOLIB
        select INPUT_POLLDEV
        help
          This driver simulates a mouse on GPIO lines of various CPUs (and some
index d44806d41b4468838fdd151da7dea61fa2e54b14..ef992293598a6d71296c28209e116cbbea75b026 100644 (file)
@@ -173,7 +173,7 @@ config LEDS_PCA9532_GPIO
 config LEDS_GPIO
        tristate "LED Support for GPIO connected LEDs"
        depends on LEDS_CLASS
-       depends on GENERIC_GPIO
+       depends on GPIOLIB
        help
          This option enables support for the LEDs connected to GPIO
          outputs. To be useful the particular board must have LEDs
@@ -362,7 +362,7 @@ config LEDS_INTEL_SS4200
 config LEDS_LT3593
        tristate "LED driver for LT3593 controllers"
        depends on LEDS_CLASS
-       depends on GENERIC_GPIO
+       depends on GPIOLIB
        help
          This option enables support for LEDs driven by a Linear Technology
          LT3593 controller. This controller uses a special one-wire pulse
@@ -431,7 +431,7 @@ config LEDS_ASIC3
 
 config LEDS_RENESAS_TPU
        bool "LED support for Renesas TPU"
-       depends on LEDS_CLASS=y && HAVE_CLK && GENERIC_GPIO
+       depends on LEDS_CLASS=y && HAVE_CLK && GPIOLIB
        help
          This option enables build of the LED TPU platform driver,
          suitable to drive any TPU channel on newer Renesas SoCs.
index 4d8d90b4fe7812ea5169619d3945b0722d46f1ec..3bfc8f1da9fe75daebda0ca2c5cad26dc4a9000a 100644 (file)
@@ -174,6 +174,8 @@ config MD_FAULTY
 
          If unsure, say N.
 
+source "drivers/md/bcache/Kconfig"
+
 config BLK_DEV_DM
        tristate "Device mapper support"
        ---help---
index 7ceeaefc0e95d0fa8b75a6bdc2c2a5846f3144fa..1439fd4ad9b1ae95e9cb7e25f40172535302222d 100644 (file)
@@ -29,6 +29,7 @@ obj-$(CONFIG_MD_RAID10)               += raid10.o
 obj-$(CONFIG_MD_RAID456)       += raid456.o
 obj-$(CONFIG_MD_MULTIPATH)     += multipath.o
 obj-$(CONFIG_MD_FAULTY)                += faulty.o
+obj-$(CONFIG_BCACHE)           += bcache/
 obj-$(CONFIG_BLK_DEV_MD)       += md-mod.o
 obj-$(CONFIG_BLK_DEV_DM)       += dm-mod.o
 obj-$(CONFIG_DM_BUFIO)         += dm-bufio.o
diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig
new file mode 100644 (file)
index 0000000..05c220d
--- /dev/null
@@ -0,0 +1,42 @@
+
+config BCACHE
+       tristate "Block device as cache"
+       select CLOSURES
+       ---help---
+       Allows a block device to be used as cache for other devices; uses
+       a btree for indexing and the layout is optimized for SSDs.
+
+       See Documentation/bcache.txt for details.
+
+config BCACHE_DEBUG
+       bool "Bcache debugging"
+       depends on BCACHE
+       ---help---
+       Don't select this option unless you're a developer
+
+       Enables extra debugging tools (primarily a fuzz tester)
+
+config BCACHE_EDEBUG
+       bool "Extended runtime checks"
+       depends on BCACHE
+       ---help---
+       Don't select this option unless you're a developer
+
+       Enables extra runtime checks which significantly affect performance
+
+config BCACHE_CLOSURES_DEBUG
+       bool "Debug closures"
+       depends on BCACHE
+       select DEBUG_FS
+       ---help---
+       Keeps all active closures in a linked list and provides a debugfs
+       interface to list them, which makes it possible to see asynchronous
+       operations that get stuck.
+
+# cgroup code needs to be updated:
+#
+#config CGROUP_BCACHE
+#      bool "Cgroup controls for bcache"
+#      depends on BCACHE && BLK_CGROUP
+#      ---help---
+#      TODO
diff --git a/drivers/md/bcache/Makefile b/drivers/md/bcache/Makefile
new file mode 100644 (file)
index 0000000..0e9c825
--- /dev/null
@@ -0,0 +1,7 @@
+
+obj-$(CONFIG_BCACHE)   += bcache.o
+
+bcache-y               := alloc.o btree.o bset.o io.o journal.o writeback.o\
+       movinggc.o request.o super.o sysfs.o debug.o util.o trace.o stats.o closure.o
+
+CFLAGS_request.o       += -Iblock
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
new file mode 100644 (file)
index 0000000..048f294
--- /dev/null
@@ -0,0 +1,599 @@
+/*
+ * Primary bucket allocation code
+ *
+ * Copyright 2012 Google, Inc.
+ *
+ * Allocation in bcache is done in terms of buckets:
+ *
+ * Each bucket has associated an 8 bit gen; this gen corresponds to the gen in
+ * btree pointers - they must match for the pointer to be considered valid.
+ *
+ * Thus (assuming a bucket has no dirty data or metadata in it) we can reuse a
+ * bucket simply by incrementing its gen.
+ *
+ * The gens (along with the priorities; it's really the gens that are important but
+ * the code is named as if it's the priorities) are written in an arbitrary list
+ * of buckets on disk, with a pointer to them in the journal header.
+ *
+ * When we invalidate a bucket, we have to write its new gen to disk and wait
+ * for that write to complete before we use it - otherwise after a crash we
+ * could have pointers that appeared to be good but pointed to data that had
+ * been overwritten.
+ *
+ * Since the gens and priorities are all stored contiguously on disk, we can
+ * batch this up: We fill up the free_inc list with freshly invalidated buckets,
+ * call prio_write(), and when prio_write() finishes we pull buckets off the
+ * free_inc list and optionally discard them.
+ *
+ * free_inc isn't the only freelist - if it was, we'd often have to sleep while
+ * priorities and gens were being written before we could allocate. c->free is a
+ * smaller freelist, and buckets on that list are always ready to be used.
+ *
+ * If we've got discards enabled, that happens when a bucket moves from the
+ * free_inc list to the free list.
+ *
+ * There is another freelist, because sometimes we have buckets that we know
+ * have nothing pointing into them - these we can reuse without waiting for
+ * priorities to be rewritten. These come from freed btree nodes and buckets
+ * that garbage collection discovered no longer had valid keys pointing into
+ * them (because they were overwritten). That's the unused list - buckets on the
+ * unused list move to the free list, optionally being discarded in the process.
+ *
+ * It's also important to ensure that gens don't wrap around - with respect to
+ * either the oldest gen in the btree or the gen on disk. This is quite
+ * difficult to do in practice, but we explicitly guard against it anyways - if
+ * a bucket is in danger of wrapping around we simply skip invalidating it that
+ * time around, and we garbage collect or rewrite the priorities sooner than we
+ * would have otherwise.
+ *
+ * bch_bucket_alloc() allocates a single bucket from a specific cache.
+ *
+ * bch_bucket_alloc_set() allocates one or more buckets from different caches
+ * out of a cache set.
+ *
+ * free_some_buckets() drives all the processes described above. It's called
+ * from bch_bucket_alloc() and a few other places that need to make sure free
+ * buckets are ready.
+ *
+ * invalidate_buckets_(lru|fifo)() find buckets that are available to be
+ * invalidated, and then invalidate them and stick them on the free_inc list -
+ * in either lru or fifo order.
+ */
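[Editorial aside, not part of the patch: the header comment above is the key to the rest of alloc.c. A bucket carries an 8-bit gen, btree pointers into it carry a copy of that gen, and "invalidating" a clean bucket is nothing more than bumping the gen so that all existing pointers go stale. The rule reduces to something like this, with deliberately simplified, hypothetical types:

	struct bucket_ex     { uint8_t gen; };
	struct bucket_ptr_ex { size_t bucket; uint8_t gen; };

	/* A pointer is only trusted while its gen matches the bucket's gen. */
	static bool ptr_valid(const struct bucket_ex *buckets, struct bucket_ptr_ex p)
	{
		return buckets[p.bucket].gen == p.gen;
	}

	/* Reusing a clean bucket: bump the gen; every old pointer goes stale.
	 * The new gen must reach disk before the bucket is rewritten, which is
	 * why the code below batches invalidations onto free_inc ahead of
	 * prio_write(). */
	static void invalidate_clean_bucket(struct bucket_ex *b)
	{
		b->gen++;
	}
]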
+
+#include "bcache.h"
+#include "btree.h"
+
+#include <linux/random.h>
+
+#define MAX_IN_FLIGHT_DISCARDS         8U
+
+/* Bucket heap / gen */
+
+uint8_t bch_inc_gen(struct cache *ca, struct bucket *b)
+{
+       uint8_t ret = ++b->gen;
+
+       ca->set->need_gc = max(ca->set->need_gc, bucket_gc_gen(b));
+       WARN_ON_ONCE(ca->set->need_gc > BUCKET_GC_GEN_MAX);
+
+       if (CACHE_SYNC(&ca->set->sb)) {
+               ca->need_save_prio = max(ca->need_save_prio,
+                                        bucket_disk_gen(b));
+               WARN_ON_ONCE(ca->need_save_prio > BUCKET_DISK_GEN_MAX);
+       }
+
+       return ret;
+}
+
+void bch_rescale_priorities(struct cache_set *c, int sectors)
+{
+       struct cache *ca;
+       struct bucket *b;
+       unsigned next = c->nbuckets * c->sb.bucket_size / 1024;
+       unsigned i;
+       int r;
+
+       atomic_sub(sectors, &c->rescale);
+
+       do {
+               r = atomic_read(&c->rescale);
+
+               if (r >= 0)
+                       return;
+       } while (atomic_cmpxchg(&c->rescale, r, r + next) != r);
+
+       mutex_lock(&c->bucket_lock);
+
+       c->min_prio = USHRT_MAX;
+
+       for_each_cache(ca, c, i)
+               for_each_bucket(b, ca)
+                       if (b->prio &&
+                           b->prio != BTREE_PRIO &&
+                           !atomic_read(&b->pin)) {
+                               b->prio--;
+                               c->min_prio = min(c->min_prio, b->prio);
+                       }
+
+       mutex_unlock(&c->bucket_lock);
+}
+
+/* Discard/TRIM */
+
+struct discard {
+       struct list_head        list;
+       struct work_struct      work;
+       struct cache            *ca;
+       long                    bucket;
+
+       struct bio              bio;
+       struct bio_vec          bv;
+};
+
+static void discard_finish(struct work_struct *w)
+{
+       struct discard *d = container_of(w, struct discard, work);
+       struct cache *ca = d->ca;
+       char buf[BDEVNAME_SIZE];
+
+       if (!test_bit(BIO_UPTODATE, &d->bio.bi_flags)) {
+               pr_notice("discard error on %s, disabling",
+                        bdevname(ca->bdev, buf));
+               d->ca->discard = 0;
+       }
+
+       mutex_lock(&ca->set->bucket_lock);
+
+       fifo_push(&ca->free, d->bucket);
+       list_add(&d->list, &ca->discards);
+       atomic_dec(&ca->discards_in_flight);
+
+       mutex_unlock(&ca->set->bucket_lock);
+
+       closure_wake_up(&ca->set->bucket_wait);
+       wake_up(&ca->set->alloc_wait);
+
+       closure_put(&ca->set->cl);
+}
+
+static void discard_endio(struct bio *bio, int error)
+{
+       struct discard *d = container_of(bio, struct discard, bio);
+       schedule_work(&d->work);
+}
+
+static void do_discard(struct cache *ca, long bucket)
+{
+       struct discard *d = list_first_entry(&ca->discards,
+                                            struct discard, list);
+
+       list_del(&d->list);
+       d->bucket = bucket;
+
+       atomic_inc(&ca->discards_in_flight);
+       closure_get(&ca->set->cl);
+
+       bio_init(&d->bio);
+
+       d->bio.bi_sector        = bucket_to_sector(ca->set, d->bucket);
+       d->bio.bi_bdev          = ca->bdev;
+       d->bio.bi_rw            = REQ_WRITE|REQ_DISCARD;
+       d->bio.bi_max_vecs      = 1;
+       d->bio.bi_io_vec        = d->bio.bi_inline_vecs;
+       d->bio.bi_size          = bucket_bytes(ca);
+       d->bio.bi_end_io        = discard_endio;
+       bio_set_prio(&d->bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
+
+       submit_bio(0, &d->bio);
+}
+
+/* Allocation */
+
+static inline bool can_inc_bucket_gen(struct bucket *b)
+{
+       return bucket_gc_gen(b) < BUCKET_GC_GEN_MAX &&
+               bucket_disk_gen(b) < BUCKET_DISK_GEN_MAX;
+}
+
+bool bch_bucket_add_unused(struct cache *ca, struct bucket *b)
+{
+       BUG_ON(GC_MARK(b) || GC_SECTORS_USED(b));
+
+       if (fifo_used(&ca->free) > ca->watermark[WATERMARK_MOVINGGC] &&
+           CACHE_REPLACEMENT(&ca->sb) == CACHE_REPLACEMENT_FIFO)
+               return false;
+
+       b->prio = 0;
+
+       if (can_inc_bucket_gen(b) &&
+           fifo_push(&ca->unused, b - ca->buckets)) {
+               atomic_inc(&b->pin);
+               return true;
+       }
+
+       return false;
+}
+
+static bool can_invalidate_bucket(struct cache *ca, struct bucket *b)
+{
+       return GC_MARK(b) == GC_MARK_RECLAIMABLE &&
+               !atomic_read(&b->pin) &&
+               can_inc_bucket_gen(b);
+}
+
+static void invalidate_one_bucket(struct cache *ca, struct bucket *b)
+{
+       bch_inc_gen(ca, b);
+       b->prio = INITIAL_PRIO;
+       atomic_inc(&b->pin);
+       fifo_push(&ca->free_inc, b - ca->buckets);
+}
+
+#define bucket_prio(b)                         \
+       (((unsigned) (b->prio - ca->set->min_prio)) * GC_SECTORS_USED(b))
+
+#define bucket_max_cmp(l, r)   (bucket_prio(l) < bucket_prio(r))
+#define bucket_min_cmp(l, r)   (bucket_prio(l) > bucket_prio(r))
+
+static void invalidate_buckets_lru(struct cache *ca)
+{
+       struct bucket *b;
+       ssize_t i;
+
+       ca->heap.used = 0;
+
+       for_each_bucket(b, ca) {
+               /*
+                * If we fill up the unused list, if we then return before
+                * adding anything to the free_inc list we'll skip writing
+                * prios/gens and just go back to allocating from the unused
+                * list:
+                */
+               if (fifo_full(&ca->unused))
+                       return;
+
+               if (!can_invalidate_bucket(ca, b))
+                       continue;
+
+               if (!GC_SECTORS_USED(b) &&
+                   bch_bucket_add_unused(ca, b))
+                       continue;
+
+               if (!heap_full(&ca->heap))
+                       heap_add(&ca->heap, b, bucket_max_cmp);
+               else if (bucket_max_cmp(b, heap_peek(&ca->heap))) {
+                       ca->heap.data[0] = b;
+                       heap_sift(&ca->heap, 0, bucket_max_cmp);
+               }
+       }
+
+       for (i = ca->heap.used / 2 - 1; i >= 0; --i)
+               heap_sift(&ca->heap, i, bucket_min_cmp);
+
+       while (!fifo_full(&ca->free_inc)) {
+               if (!heap_pop(&ca->heap, b, bucket_min_cmp)) {
+                       /*
+                        * We don't want to be calling invalidate_buckets()
+                        * multiple times when it can't do anything
+                        */
+                       ca->invalidate_needs_gc = 1;
+                       bch_queue_gc(ca->set);
+                       return;
+               }
+
+               invalidate_one_bucket(ca, b);
+       }
+}
+
+static void invalidate_buckets_fifo(struct cache *ca)
+{
+       struct bucket *b;
+       size_t checked = 0;
+
+       while (!fifo_full(&ca->free_inc)) {
+               if (ca->fifo_last_bucket <  ca->sb.first_bucket ||
+                   ca->fifo_last_bucket >= ca->sb.nbuckets)
+                       ca->fifo_last_bucket = ca->sb.first_bucket;
+
+               b = ca->buckets + ca->fifo_last_bucket++;
+
+               if (can_invalidate_bucket(ca, b))
+                       invalidate_one_bucket(ca, b);
+
+               if (++checked >= ca->sb.nbuckets) {
+                       ca->invalidate_needs_gc = 1;
+                       bch_queue_gc(ca->set);
+                       return;
+               }
+       }
+}
+
+static void invalidate_buckets_random(struct cache *ca)
+{
+       struct bucket *b;
+       size_t checked = 0;
+
+       while (!fifo_full(&ca->free_inc)) {
+               size_t n;
+               get_random_bytes(&n, sizeof(n));
+
+               n %= (size_t) (ca->sb.nbuckets - ca->sb.first_bucket);
+               n += ca->sb.first_bucket;
+
+               b = ca->buckets + n;
+
+               if (can_invalidate_bucket(ca, b))
+                       invalidate_one_bucket(ca, b);
+
+               if (++checked >= ca->sb.nbuckets / 2) {
+                       ca->invalidate_needs_gc = 1;
+                       bch_queue_gc(ca->set);
+                       return;
+               }
+       }
+}
+
+static void invalidate_buckets(struct cache *ca)
+{
+       if (ca->invalidate_needs_gc)
+               return;
+
+       switch (CACHE_REPLACEMENT(&ca->sb)) {
+       case CACHE_REPLACEMENT_LRU:
+               invalidate_buckets_lru(ca);
+               break;
+       case CACHE_REPLACEMENT_FIFO:
+               invalidate_buckets_fifo(ca);
+               break;
+       case CACHE_REPLACEMENT_RANDOM:
+               invalidate_buckets_random(ca);
+               break;
+       }
+
+       pr_debug("free %zu/%zu free_inc %zu/%zu unused %zu/%zu",
+                fifo_used(&ca->free), ca->free.size,
+                fifo_used(&ca->free_inc), ca->free_inc.size,
+                fifo_used(&ca->unused), ca->unused.size);
+}
+
+#define allocator_wait(ca, cond)                                       \
+do {                                                                   \
+       DEFINE_WAIT(__wait);                                            \
+                                                                       \
+       while (1) {                                                     \
+               prepare_to_wait(&ca->set->alloc_wait,                   \
+                               &__wait, TASK_INTERRUPTIBLE);           \
+               if (cond)                                               \
+                       break;                                          \
+                                                                       \
+               mutex_unlock(&(ca)->set->bucket_lock);                  \
+               if (test_bit(CACHE_SET_STOPPING_2, &ca->set->flags)) {  \
+                       finish_wait(&ca->set->alloc_wait, &__wait);     \
+                       closure_return(cl);                             \
+               }                                                       \
+                                                                       \
+               schedule();                                             \
+               mutex_lock(&(ca)->set->bucket_lock);                    \
+       }                                                               \
+                                                                       \
+       finish_wait(&ca->set->alloc_wait, &__wait);                     \
+} while (0)
+
+void bch_allocator_thread(struct closure *cl)
+{
+       struct cache *ca = container_of(cl, struct cache, alloc);
+
+       mutex_lock(&ca->set->bucket_lock);
+
+       while (1) {
+               /*
+                * First, we pull buckets off of the unused and free_inc lists,
+                * possibly issue discards to them, then we add the bucket to
+                * the free list:
+                */
+               while (1) {
+                       long bucket;
+
+                       if ((!atomic_read(&ca->set->prio_blocked) ||
+                            !CACHE_SYNC(&ca->set->sb)) &&
+                           !fifo_empty(&ca->unused))
+                               fifo_pop(&ca->unused, bucket);
+                       else if (!fifo_empty(&ca->free_inc))
+                               fifo_pop(&ca->free_inc, bucket);
+                       else
+                               break;
+
+                       allocator_wait(ca, (int) fifo_free(&ca->free) >
+                                      atomic_read(&ca->discards_in_flight));
+
+                       if (ca->discard) {
+                               allocator_wait(ca, !list_empty(&ca->discards));
+                               do_discard(ca, bucket);
+                       } else {
+                               fifo_push(&ca->free, bucket);
+                               closure_wake_up(&ca->set->bucket_wait);
+                       }
+               }
+
+               /*
+                * We've run out of free buckets, we need to find some buckets
+                * we can invalidate. First, invalidate them in memory and add
+                * them to the free_inc list:
+                */
+
+               allocator_wait(ca, ca->set->gc_mark_valid &&
+                              (ca->need_save_prio > 64 ||
+                               !ca->invalidate_needs_gc));
+               invalidate_buckets(ca);
+
+               /*
+                * Now, we write their new gens to disk so we can start writing
+                * new stuff to them:
+                */
+               allocator_wait(ca, !atomic_read(&ca->set->prio_blocked));
+               if (CACHE_SYNC(&ca->set->sb) &&
+                   (!fifo_empty(&ca->free_inc) ||
+                    ca->need_save_prio > 64))
+                       bch_prio_write(ca);
+       }
+}
+
+long bch_bucket_alloc(struct cache *ca, unsigned watermark, struct closure *cl)
+{
+       long r = -1;
+again:
+       wake_up(&ca->set->alloc_wait);
+
+       if (fifo_used(&ca->free) > ca->watermark[watermark] &&
+           fifo_pop(&ca->free, r)) {
+               struct bucket *b = ca->buckets + r;
+#ifdef CONFIG_BCACHE_EDEBUG
+               size_t iter;
+               long i;
+
+               for (iter = 0; iter < prio_buckets(ca) * 2; iter++)
+                       BUG_ON(ca->prio_buckets[iter] == (uint64_t) r);
+
+               fifo_for_each(i, &ca->free, iter)
+                       BUG_ON(i == r);
+               fifo_for_each(i, &ca->free_inc, iter)
+                       BUG_ON(i == r);
+               fifo_for_each(i, &ca->unused, iter)
+                       BUG_ON(i == r);
+#endif
+               BUG_ON(atomic_read(&b->pin) != 1);
+
+               SET_GC_SECTORS_USED(b, ca->sb.bucket_size);
+
+               if (watermark <= WATERMARK_METADATA) {
+                       SET_GC_MARK(b, GC_MARK_METADATA);
+                       b->prio = BTREE_PRIO;
+               } else {
+                       SET_GC_MARK(b, GC_MARK_RECLAIMABLE);
+                       b->prio = INITIAL_PRIO;
+               }
+
+               return r;
+       }
+
+       pr_debug("alloc failure: blocked %i free %zu free_inc %zu unused %zu",
+                atomic_read(&ca->set->prio_blocked), fifo_used(&ca->free),
+                fifo_used(&ca->free_inc), fifo_used(&ca->unused));
+
+       if (cl) {
+               closure_wait(&ca->set->bucket_wait, cl);
+
+               if (closure_blocking(cl)) {
+                       mutex_unlock(&ca->set->bucket_lock);
+                       closure_sync(cl);
+                       mutex_lock(&ca->set->bucket_lock);
+                       goto again;
+               }
+       }
+
+       return -1;
+}
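+
+/*
+ * Sketch of a hypothetical caller of bch_bucket_alloc(): allocate one bucket
+ * from the WATERMARK_NONE reserve, retrying via @cl if it's a blocking
+ * closure.  bucket_lock must be held, and may be dropped and retaken while
+ * waiting:
+ *
+ *	long b;
+ *
+ *	mutex_lock(&ca->set->bucket_lock);
+ *	b = bch_bucket_alloc(ca, WATERMARK_NONE, cl);
+ *	mutex_unlock(&ca->set->bucket_lock);
+ *
+ *	if (b < 0)
+ *		return -1;	(only if @cl was NULL or non blocking)
+ *
+ * __bch_bucket_alloc_set() below is the real in-tree user of this pattern.
+ */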
+
+void bch_bucket_free(struct cache_set *c, struct bkey *k)
+{
+       unsigned i;
+
+       for (i = 0; i < KEY_PTRS(k); i++) {
+               struct bucket *b = PTR_BUCKET(c, k, i);
+
+               SET_GC_MARK(b, GC_MARK_RECLAIMABLE);
+               SET_GC_SECTORS_USED(b, 0);
+               bch_bucket_add_unused(PTR_CACHE(c, k, i), b);
+       }
+}
+
+int __bch_bucket_alloc_set(struct cache_set *c, unsigned watermark,
+                          struct bkey *k, int n, struct closure *cl)
+{
+       int i;
+
+       lockdep_assert_held(&c->bucket_lock);
+       BUG_ON(!n || n > c->caches_loaded || n > 8);
+
+       bkey_init(k);
+
+       /* sort by free space/prio of oldest data in caches */
+
+       for (i = 0; i < n; i++) {
+               struct cache *ca = c->cache_by_alloc[i];
+               long b = bch_bucket_alloc(ca, watermark, cl);
+
+               if (b == -1)
+                       goto err;
+
+               k->ptr[i] = PTR(ca->buckets[b].gen,
+                               bucket_to_sector(c, b),
+                               ca->sb.nr_this_dev);
+
+               SET_KEY_PTRS(k, i + 1);
+       }
+
+       return 0;
+err:
+       bch_bucket_free(c, k);
+       __bkey_put(c, k);
+       return -1;
+}
+
+int bch_bucket_alloc_set(struct cache_set *c, unsigned watermark,
+                        struct bkey *k, int n, struct closure *cl)
+{
+       int ret;
+       mutex_lock(&c->bucket_lock);
+       ret = __bch_bucket_alloc_set(c, watermark, k, n, cl);
+       mutex_unlock(&c->bucket_lock);
+       return ret;
+}
+
+/* Init */
+
+void bch_cache_allocator_exit(struct cache *ca)
+{
+       struct discard *d;
+
+       while (!list_empty(&ca->discards)) {
+               d = list_first_entry(&ca->discards, struct discard, list);
+               cancel_work_sync(&d->work);
+               list_del(&d->list);
+               kfree(d);
+       }
+}
+
+int bch_cache_allocator_init(struct cache *ca)
+{
+       unsigned i;
+
+       /*
+        * Reserve:
+        * Prio/gen writes first
+        * Then 8 for btree allocations
+        * Then half for the moving garbage collector
+        */
+
+       ca->watermark[WATERMARK_PRIO] = 0;
+
+       ca->watermark[WATERMARK_METADATA] = prio_buckets(ca);
+
+       ca->watermark[WATERMARK_MOVINGGC] = 8 +
+               ca->watermark[WATERMARK_METADATA];
+
+       ca->watermark[WATERMARK_NONE] = ca->free.size / 2 +
+               ca->watermark[WATERMARK_MOVINGGC];
+
+       for (i = 0; i < MAX_IN_FLIGHT_DISCARDS; i++) {
+               struct discard *d = kzalloc(sizeof(*d), GFP_KERNEL);
+               if (!d)
+                       return -ENOMEM;
+
+               d->ca = ca;
+               INIT_WORK(&d->work, discard_finish);
+               list_add(&d->list, &ca->discards);
+       }
+
+       return 0;
+}
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
new file mode 100644 (file)
index 0000000..340146d
--- /dev/null
@@ -0,0 +1,1259 @@
+#ifndef _BCACHE_H
+#define _BCACHE_H
+
+/*
+ * SOME HIGH LEVEL CODE DOCUMENTATION:
+ *
+ * Bcache mostly works with cache sets, cache devices, and backing devices.
+ *
+ * Support for multiple cache devices hasn't quite been finished off yet, but
+ * it's about 95% plumbed through. A cache set and its cache devices are sort
+ * of like an md raid array and its component devices. Most of the code doesn't
+ * care about individual cache devices; the main abstraction is the cache set.
+ *
+ * Multiple cache devices are intended to give us the ability to mirror dirty
+ * cached data and metadata, without mirroring clean cached data.
+ *
+ * Backing devices are different, in that they have a lifetime independent of a
+ * cache set. When you register a newly formatted backing device it'll come up
+ * in passthrough mode, and then you can attach and detach a backing device from
+ * a cache set at runtime - while it's mounted and in use. Detaching implicitly
+ * invalidates any cached data for that backing device.
+ *
+ * A cache set can have multiple (many) backing devices attached to it.
+ *
+ * There's also flash only volumes - this is the reason for the distinction
+ * between struct cached_dev and struct bcache_device. A flash only volume
+ * works much like a bcache device that has a backing device, except the
+ * "cached" data is always dirty. The end result is that we get thin
+ * provisioning with very little additional code.
+ *
+ * Flash only volumes work but they're not production ready because the moving
+ * garbage collector needs more work. More on that later.
+ *
+ * BUCKETS/ALLOCATION:
+ *
+ * Bcache is primarily designed for caching, which means that in normal
+ * operation all of our available space will be allocated. Thus, we need an
+ * efficient way of deleting things from the cache so we can write new things to
+ * it.
+ *
+ * To do this, we first divide the cache device up into buckets. A bucket is the
+ * unit of allocation; they're typically around 1 mb - anywhere from 128k to 2M+
+ * works efficiently.
+ *
+ * Each bucket has a 16 bit priority, and an 8 bit generation associated with
+ * it. The gens and priorities for all the buckets are stored contiguously and
+ * packed on disk (in a linked list of buckets - aside from the superblock, all
+ * of bcache's metadata is stored in buckets).
+ *
+ * The priority is used to implement an LRU. We reset a bucket's priority when
+ * we allocate it or on a cache hit, and every so often we decrement the priority
+ * of each bucket. It could be used to implement something more sophisticated,
+ * if anyone ever gets around to it.
+ *
+ * The generation is used for invalidating buckets. Each pointer also has an 8
+ * bit generation embedded in it; for a pointer to be considered valid, its gen
+ * must match the gen of the bucket it points into.  Thus, to reuse a bucket all
+ * we have to do is increment its gen (and write its new gen to disk; we batch
+ * this up).
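+ *
+ * Concretely, a pointer is only considered valid when the gen it carries
+ * matches the bucket's current gen - i.e. when PTR_GEN(k, i) ==
+ * PTR_BUCKET(c, k, i)->gen - so bumping a bucket's gen from, say, 3 to 4
+ * instantly stales every pointer still carrying gen 3.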
+ *
+ * Bcache is entirely COW - we never write twice to a bucket, even buckets that
+ * contain metadata (including btree nodes).
+ *
+ * THE BTREE:
+ *
+ * Bcache is in large part designed around the btree.
+ *
+ * At a high level, the btree is just an index of key -> ptr tuples.
+ *
+ * Keys represent extents, and thus have a size field. Keys also have a variable
+ * number of pointers attached to them (potentially zero, which is handy for
+ * invalidating the cache).
+ *
+ * The key itself is an inode:offset pair. The inode number corresponds to a
+ * backing device or a flash only volume. The offset is the ending offset of the
+ * extent within the inode - not the starting offset; this makes lookups
+ * slightly more convenient.
+ *
+ * Pointers contain the cache device id, the offset on that device, and an 8 bit
+ * generation number. More on the gen later.
+ *
+ * Index lookups are not fully abstracted - cache lookups in particular are
+ * still somewhat mixed in with the btree code, but things are headed in that
+ * direction.
+ *
+ * Updates are fairly well abstracted, though. There are two different ways of
+ * updating the btree; insert and replace.
+ *
+ * BTREE_INSERT will just take a list of keys and insert them into the btree -
+ * overwriting (possibly only partially) any extents they overlap with. This is
+ * used to update the index after a write.
+ *
+ * BTREE_REPLACE is really cmpxchg(); it inserts a key into the btree iff it is
+ * overwriting a key that matches another given key. This is used for inserting
+ * data into the cache after a cache miss, and for background writeback, and for
+ * the moving garbage collector.
+ *
+ * There is no "delete" operation; deleting things from the index is
+ * accomplished either by invalidating pointers (by incrementing a bucket's
+ * gen) or by inserting a key with 0 pointers - which will overwrite anything
+ * previously present at that location in the index.
+ *
+ * This means that there are always stale/invalid keys in the btree. They're
+ * filtered out by the code that iterates through a btree node, and removed when
+ * a btree node is rewritten.
+ *
+ * BTREE NODES:
+ *
+ * Our unit of allocation is a bucket, and we can't arbitrarily allocate and
+ * free smaller than a bucket - so, that's how big our btree nodes are.
+ *
+ * (If buckets are really big we'll only use part of the bucket for a btree node
+ * - no less than 1/4th - but a bucket still contains no more than a single
+ * btree node. I'd actually like to change this, but for now we rely on the
+ * bucket's gen for deleting btree nodes when we rewrite/split a node.)
+ *
+ * Anyways, btree nodes are big - big enough to be inefficient with a textbook
+ * btree implementation.
+ *
+ * The way this is solved is that btree nodes are internally log structured; we
+ * can append new keys to an existing btree node without rewriting it. This
+ * means each set of keys we write is sorted, but the node is not.
+ *
+ * We maintain this log structure in memory - keeping 1Mb of keys sorted would
+ * be expensive, and we have to distinguish between the keys we have written and
+ * the keys we haven't. So to do a lookup in a btree node, we have to search
+ * each sorted set. But we do merge written sets together lazily, so the cost of
+ * these extra searches is quite low (normally most of the keys in a btree node
+ * will be in one big set, and then there'll be one or two sets that are much
+ * smaller).
+ *
+ * This log structure makes bcache's btree more of a hybrid between a
+ * conventional btree and a compacting data structure, with some of the
+ * advantages of both.
+ *
+ * GARBAGE COLLECTION:
+ *
+ * We can't just invalidate any bucket - it might contain dirty data or
+ * metadata. If it once contained dirty data, other writes might overwrite it
+ * later, leaving no valid pointers into that bucket in the index.
+ *
+ * Thus, the primary purpose of garbage collection is to find buckets to reuse.
+ * It also counts how much valid data each bucket currently contains, so that
+ * allocation can reuse buckets sooner when they've been mostly overwritten.
+ *
+ * It also does some things that are really internal to the btree
+ * implementation. If a btree node contains pointers that are stale by more than
+ * some threshold, it rewrites the btree node to avoid the bucket's generation
+ * wrapping around. It also merges adjacent btree nodes if they're empty enough.
+ *
+ * THE JOURNAL:
+ *
+ * Bcache's journal is not necessary for consistency; we always strictly
+ * order metadata writes so that the btree and everything else is consistent on
+ * disk in the event of an unclean shutdown, and in fact bcache had writeback
+ * caching (with recovery from unclean shutdown) before journalling was
+ * implemented.
+ *
+ * Rather, the journal is purely a performance optimization; we can't complete a
+ * write until we've updated the index on disk, otherwise the cache would be
+ * inconsistent in the event of an unclean shutdown. This means that without the
+ * journal, on random write workloads we constantly have to update all the leaf
+ * nodes in the btree, and those writes will be mostly empty (appending at most
+ * a few keys each) - highly inefficient in terms of the amount of metadata written,
+ * and it puts more strain on the various btree resorting/compacting code.
+ *
+ * The journal is just a log of keys we've inserted; on startup we just reinsert
+ * all the keys in the open journal entries. That means that when we're updating
+ * a node in the btree, we can wait until a 4k block of keys fills up before
+ * writing them out.
+ *
+ * For simplicity, we only journal updates to leaf nodes; updates to parent
+ * nodes are rare enough (since our leaf nodes are huge) that it wasn't worth
+ * the complexity to deal with journalling them (in particular, journal replay)
+ * - updates to non leaf nodes just happen synchronously (see btree_split()).
+ */
+
+#define pr_fmt(fmt) "bcache: %s() " fmt "\n", __func__
+
+#include <linux/bio.h>
+#include <linux/blktrace_api.h>
+#include <linux/kobject.h>
+#include <linux/list.h>
+#include <linux/mutex.h>
+#include <linux/rbtree.h>
+#include <linux/rwsem.h>
+#include <linux/types.h>
+#include <linux/workqueue.h>
+
+#include "util.h"
+#include "closure.h"
+
+struct bucket {
+       atomic_t        pin;
+       uint16_t        prio;
+       uint8_t         gen;
+       uint8_t         disk_gen;
+       uint8_t         last_gc; /* Most out of date gen in the btree */
+       uint8_t         gc_gen;
+       uint16_t        gc_mark;
+};
+
+/*
+ * I'd use bitfields for these, but I don't trust the compiler not to screw me
+ * as multiple threads touch struct bucket without locking
+ */
+
+BITMASK(GC_MARK,        struct bucket, gc_mark, 0, 2);
+#define GC_MARK_RECLAIMABLE    0
+#define GC_MARK_DIRTY          1
+#define GC_MARK_METADATA       2
+BITMASK(GC_SECTORS_USED, struct bucket, gc_mark, 2, 14);
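+
+/*
+ * BITMASK() (assumed to come from util.h, which isn't shown here) generates a
+ * getter/setter pair, so the declarations above give us GC_MARK(b) /
+ * SET_GC_MARK(b, v) and GC_SECTORS_USED(b) / SET_GC_SECTORS_USED(b, v).  The
+ * allocator uses them like this (see bch_bucket_alloc() in alloc.c):
+ *
+ *	SET_GC_SECTORS_USED(b, ca->sb.bucket_size);
+ *	SET_GC_MARK(b, GC_MARK_METADATA);
+ */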
+
+struct bkey {
+       uint64_t        high;
+       uint64_t        low;
+       uint64_t        ptr[];
+};
+
+/* Enough for a key with 6 pointers */
+#define BKEY_PAD               8
+
+#define BKEY_PADDED(key)                                       \
+       union { struct bkey key; uint64_t key ## _pad[BKEY_PAD]; }
+
+/* Version 0: Cache device
+ * Version 1: Backing device
+ * Version 2: Seed pointer into btree node checksum
+ * Version 3: Cache device with new UUID format
+ * Version 4: Backing device with data offset
+ */
+#define BCACHE_SB_VERSION_CDEV                 0
+#define BCACHE_SB_VERSION_BDEV                 1
+#define BCACHE_SB_VERSION_CDEV_WITH_UUID       3
+#define BCACHE_SB_VERSION_BDEV_WITH_OFFSET     4
+#define BCACHE_SB_MAX_VERSION                  4
+
+#define SB_SECTOR              8
+#define SB_SIZE                        4096
+#define SB_LABEL_SIZE          32
+#define SB_JOURNAL_BUCKETS     256U
+/* SB_JOURNAL_BUCKETS must be divisible by BITS_PER_LONG */
+#define MAX_CACHES_PER_SET     8
+
+#define BDEV_DATA_START_DEFAULT        16      /* sectors */
+
+struct cache_sb {
+       uint64_t                csum;
+       uint64_t                offset; /* sector where this sb was written */
+       uint64_t                version;
+
+       uint8_t                 magic[16];
+
+       uint8_t                 uuid[16];
+       union {
+               uint8_t         set_uuid[16];
+               uint64_t        set_magic;
+       };
+       uint8_t                 label[SB_LABEL_SIZE];
+
+       uint64_t                flags;
+       uint64_t                seq;
+       uint64_t                pad[8];
+
+       union {
+       struct {
+               /* Cache devices */
+               uint64_t        nbuckets;       /* device size */
+
+               uint16_t        block_size;     /* sectors */
+               uint16_t        bucket_size;    /* sectors */
+
+               uint16_t        nr_in_set;
+               uint16_t        nr_this_dev;
+       };
+       struct {
+               /* Backing devices */
+               uint64_t        data_offset;
+
+               /*
+                * block_size from the cache device section is still used by
+                * backing devices, so don't add anything here until we fix
+                * things to not need it for backing devices anymore
+                */
+       };
+       };
+
+       uint32_t                last_mount;     /* time_t */
+
+       uint16_t                first_bucket;
+       union {
+               uint16_t        njournal_buckets;
+               uint16_t        keys;
+       };
+       uint64_t                d[SB_JOURNAL_BUCKETS];  /* journal buckets */
+};
+
+BITMASK(CACHE_SYNC,            struct cache_sb, flags, 0, 1);
+BITMASK(CACHE_DISCARD,         struct cache_sb, flags, 1, 1);
+BITMASK(CACHE_REPLACEMENT,     struct cache_sb, flags, 2, 3);
+#define CACHE_REPLACEMENT_LRU  0U
+#define CACHE_REPLACEMENT_FIFO 1U
+#define CACHE_REPLACEMENT_RANDOM 2U
+
+BITMASK(BDEV_CACHE_MODE,       struct cache_sb, flags, 0, 4);
+#define CACHE_MODE_WRITETHROUGH        0U
+#define CACHE_MODE_WRITEBACK   1U
+#define CACHE_MODE_WRITEAROUND 2U
+#define CACHE_MODE_NONE                3U
+BITMASK(BDEV_STATE,            struct cache_sb, flags, 61, 2);
+#define BDEV_STATE_NONE                0U
+#define BDEV_STATE_CLEAN       1U
+#define BDEV_STATE_DIRTY       2U
+#define BDEV_STATE_STALE       3U
+
+/* Version 1: Seed pointer into btree node checksum
+ */
+#define BCACHE_BSET_VERSION    1
+
+/*
+ * This is the on disk format for btree nodes - a btree node on disk is a list
+ * of these; within each set the keys are sorted
+ */
+struct bset {
+       uint64_t                csum;
+       uint64_t                magic;
+       uint64_t                seq;
+       uint32_t                version;
+       uint32_t                keys;
+
+       union {
+               struct bkey     start[0];
+               uint64_t        d[0];
+       };
+};
+
+/*
+ * On disk format for priorities and gens - see super.c near prio_write() for
+ * more.
+ */
+struct prio_set {
+       uint64_t                csum;
+       uint64_t                magic;
+       uint64_t                seq;
+       uint32_t                version;
+       uint32_t                pad;
+
+       uint64_t                next_bucket;
+
+       struct bucket_disk {
+               uint16_t        prio;
+               uint8_t         gen;
+       } __attribute((packed)) data[];
+};
+
+struct uuid_entry {
+       union {
+               struct {
+                       uint8_t         uuid[16];
+                       uint8_t         label[32];
+                       uint32_t        first_reg;
+                       uint32_t        last_reg;
+                       uint32_t        invalidated;
+
+                       uint32_t        flags;
+                       /* Size of flash only volumes */
+                       uint64_t        sectors;
+               };
+
+               uint8_t pad[128];
+       };
+};
+
+BITMASK(UUID_FLASH_ONLY,       struct uuid_entry, flags, 0, 1);
+
+#include "journal.h"
+#include "stats.h"
+struct search;
+struct btree;
+struct keybuf;
+
+struct keybuf_key {
+       struct rb_node          node;
+       BKEY_PADDED(key);
+       void                    *private;
+};
+
+typedef bool (keybuf_pred_fn)(struct keybuf *, struct bkey *);
+
+struct keybuf {
+       keybuf_pred_fn          *key_predicate;
+
+       struct bkey             last_scanned;
+       spinlock_t              lock;
+
+       /*
+        * Beginning and end of range in rb tree - so that we can skip taking
+        * lock and checking the rb tree when we need to check for overlapping
+        * keys.
+        */
+       struct bkey             start;
+       struct bkey             end;
+
+       struct rb_root          keys;
+
+#define KEYBUF_NR              100
+       DECLARE_ARRAY_ALLOCATOR(struct keybuf_key, freelist, KEYBUF_NR);
+};
+
+struct bio_split_pool {
+       struct bio_set          *bio_split;
+       mempool_t               *bio_split_hook;
+};
+
+struct bio_split_hook {
+       struct closure          cl;
+       struct bio_split_pool   *p;
+       struct bio              *bio;
+       bio_end_io_t            *bi_end_io;
+       void                    *bi_private;
+};
+
+struct bcache_device {
+       struct closure          cl;
+
+       struct kobject          kobj;
+
+       struct cache_set        *c;
+       unsigned                id;
+#define BCACHEDEVNAME_SIZE     12
+       char                    name[BCACHEDEVNAME_SIZE];
+
+       struct gendisk          *disk;
+
+       /* If nonzero, we're closing */
+       atomic_t                closing;
+
+       /* If nonzero, we're detaching/unregistering from cache set */
+       atomic_t                detaching;
+
+       atomic_long_t           sectors_dirty;
+       unsigned long           sectors_dirty_gc;
+       unsigned long           sectors_dirty_last;
+       long                    sectors_dirty_derivative;
+
+       mempool_t               *unaligned_bvec;
+       struct bio_set          *bio_split;
+
+       unsigned                data_csum:1;
+
+       int (*cache_miss)(struct btree *, struct search *,
+                         struct bio *, unsigned);
+       int (*ioctl) (struct bcache_device *, fmode_t, unsigned, unsigned long);
+
+       struct bio_split_pool   bio_split_hook;
+};
+
+struct io {
+       /* Used to track sequential IO so it can be skipped */
+       struct hlist_node       hash;
+       struct list_head        lru;
+
+       unsigned long           jiffies;
+       unsigned                sequential;
+       sector_t                last;
+};
+
+struct cached_dev {
+       struct list_head        list;
+       struct bcache_device    disk;
+       struct block_device     *bdev;
+
+       struct cache_sb         sb;
+       struct bio              sb_bio;
+       struct bio_vec          sb_bv[1];
+       struct closure_with_waitlist sb_write;
+
+       /* Refcount on the cache set. Always nonzero when we're caching. */
+       atomic_t                count;
+       struct work_struct      detach;
+
+       /*
+        * Device might not be running if it's dirty and the cache set hasn't
+        * shown up yet.
+        */
+       atomic_t                running;
+
+       /*
+        * Writes take a shared lock from start to finish; scanning for dirty
+        * data to refill the rb tree requires an exclusive lock.
+        */
+       struct rw_semaphore     writeback_lock;
+
+       /*
+        * Nonzero, and writeback has a refcount (d->count), iff there is dirty
+        * data in the cache. Protected by writeback_lock; must have a
+        * shared lock to set and an exclusive lock to clear.
+        */
+       atomic_t                has_dirty;
+
+       struct ratelimit        writeback_rate;
+       struct delayed_work     writeback_rate_update;
+
+       /*
+        * Internal to the writeback code, so read_dirty() can keep track of
+        * where it's at.
+        */
+       sector_t                last_read;
+
+       /* Number of writeback bios in flight */
+       atomic_t                in_flight;
+       struct closure_with_timer writeback;
+       struct closure_waitlist writeback_wait;
+
+       struct keybuf           writeback_keys;
+
+       /* For tracking sequential IO */
+#define RECENT_IO_BITS 7
+#define RECENT_IO      (1 << RECENT_IO_BITS)
+       struct io               io[RECENT_IO];
+       struct hlist_head       io_hash[RECENT_IO + 1];
+       struct list_head        io_lru;
+       spinlock_t              io_lock;
+
+       struct cache_accounting accounting;
+
+       /* The rest of this all shows up in sysfs */
+       unsigned                sequential_cutoff;
+       unsigned                readahead;
+
+       unsigned                sequential_merge:1;
+       unsigned                verify:1;
+
+       unsigned                writeback_metadata:1;
+       unsigned                writeback_running:1;
+       unsigned char           writeback_percent;
+       unsigned                writeback_delay;
+
+       int                     writeback_rate_change;
+       int64_t                 writeback_rate_derivative;
+       uint64_t                writeback_rate_target;
+
+       unsigned                writeback_rate_update_seconds;
+       unsigned                writeback_rate_d_term;
+       unsigned                writeback_rate_p_term_inverse;
+       unsigned                writeback_rate_d_smooth;
+};
+
+enum alloc_watermarks {
+       WATERMARK_PRIO,
+       WATERMARK_METADATA,
+       WATERMARK_MOVINGGC,
+       WATERMARK_NONE,
+       WATERMARK_MAX
+};
+
+struct cache {
+       struct cache_set        *set;
+       struct cache_sb         sb;
+       struct bio              sb_bio;
+       struct bio_vec          sb_bv[1];
+
+       struct kobject          kobj;
+       struct block_device     *bdev;
+
+       unsigned                watermark[WATERMARK_MAX];
+
+       struct closure          alloc;
+       struct workqueue_struct *alloc_workqueue;
+
+       struct closure          prio;
+       struct prio_set         *disk_buckets;
+
+       /*
+        * When allocating new buckets, prio_write() gets first dibs - since we
+        * may not be able to allocate at all without writing priorities and gens.
+        * prio_buckets[] contains the last buckets we wrote priorities to (so
+        * gc can mark them as metadata), prio_next[] contains the buckets
+        * allocated for the next prio write.
+        */
+       uint64_t                *prio_buckets;
+       uint64_t                *prio_last_buckets;
+
+       /*
+        * free: Buckets that are ready to be used
+        *
+        * free_inc: Incoming buckets - these are buckets that currently have
+        * cached data in them, and we can't reuse them until after we write
+        * their new gen to disk. After prio_write() finishes writing the new
+        * gens/prios, they'll be moved to the free list (and possibly discarded
+        * in the process)
+        *
+        * unused: GC found nothing pointing into these buckets (possibly
+        * because all the data they contained was overwritten), so we only
+        * need to discard them before they can be moved to the free list.
+        */
+       DECLARE_FIFO(long, free);
+       DECLARE_FIFO(long, free_inc);
+       DECLARE_FIFO(long, unused);
+
+       size_t                  fifo_last_bucket;
+
+       /* Allocation stuff: */
+       struct bucket           *buckets;
+
+       DECLARE_HEAP(struct bucket *, heap);
+
+       /*
+        * max(gen - disk_gen) for all buckets. When it gets too big we have to
+        * call prio_write() to keep gens from wrapping.
+        */
+       uint8_t                 need_save_prio;
+       unsigned                gc_move_threshold;
+
+       /*
+        * If nonzero, we know we aren't going to find any buckets to invalidate
+        * until a gc finishes - otherwise we could pointlessly burn a ton of
+        * cpu
+        */
+       unsigned                invalidate_needs_gc:1;
+
+       bool                    discard; /* Get rid of? */
+
+       /*
+        * We preallocate structs for issuing discards to buckets, and keep them
+        * on this list when they're not in use; do_discard() issues discards
+        * whenever there's work to do and is called by free_some_buckets() and
+        * when a discard finishes.
+        */
+       atomic_t                discards_in_flight;
+       struct list_head        discards;
+
+       struct journal_device   journal;
+
+       /* The rest of this all shows up in sysfs */
+#define IO_ERROR_SHIFT         20
+       atomic_t                io_errors;
+       atomic_t                io_count;
+
+       atomic_long_t           meta_sectors_written;
+       atomic_long_t           btree_sectors_written;
+       atomic_long_t           sectors_written;
+
+       struct bio_split_pool   bio_split_hook;
+};
+
+struct gc_stat {
+       size_t                  nodes;
+       size_t                  key_bytes;
+
+       size_t                  nkeys;
+       uint64_t                data;   /* sectors */
+       uint64_t                dirty;  /* sectors */
+       unsigned                in_use; /* percent */
+};
+
+/*
+ * Flag bits, for how the cache set is shutting down, and what phase it's at:
+ *
+ * CACHE_SET_UNREGISTERING means we're not just shutting down, we're detaching
+ * all the backing devices first (their cached data gets invalidated, and they
+ * won't automatically reattach).
+ *
+ * CACHE_SET_STOPPING always gets set first when we're closing down a cache set;
+ * we'll continue to run normally for a while with CACHE_SET_STOPPING set (i.e.
+ * flushing dirty data).
+ *
+ * CACHE_SET_STOPPING_2 gets set at the last phase, when it's time to shut down
+ * the allocation thread.
+ */
+#define CACHE_SET_UNREGISTERING                0
+#define        CACHE_SET_STOPPING              1
+#define        CACHE_SET_STOPPING_2            2
+
+struct cache_set {
+       struct closure          cl;
+
+       struct list_head        list;
+       struct kobject          kobj;
+       struct kobject          internal;
+       struct dentry           *debug;
+       struct cache_accounting accounting;
+
+       unsigned long           flags;
+
+       struct cache_sb         sb;
+
+       struct cache            *cache[MAX_CACHES_PER_SET];
+       struct cache            *cache_by_alloc[MAX_CACHES_PER_SET];
+       int                     caches_loaded;
+
+       struct bcache_device    **devices;
+       struct list_head        cached_devs;
+       uint64_t                cached_dev_sectors;
+       struct closure          caching;
+
+       struct closure_with_waitlist sb_write;
+
+       mempool_t               *search;
+       mempool_t               *bio_meta;
+       struct bio_set          *bio_split;
+
+       /* For the btree cache */
+       struct shrinker         shrink;
+
+       /* For the allocator itself */
+       wait_queue_head_t       alloc_wait;
+
+       /* For the btree cache and anything allocation related */
+       struct mutex            bucket_lock;
+
+       /* log2(bucket_size), in sectors */
+       unsigned short          bucket_bits;
+
+       /* log2(block_size), in sectors */
+       unsigned short          block_bits;
+
+       /*
+        * Default number of pages for a new btree node - may be less than a
+        * full bucket
+        */
+       unsigned                btree_pages;
+
+       /*
+        * Lists of struct btrees; btree_cache is the lru list for structs that
+        * have memory allocated for an actual btree node, btree_cache_freed is
+        * for structs that do not.
+        *
+        * We never free a struct btree, except on shutdown - we just put it on
+        * the btree_cache_freed list and reuse it later. This simplifies the
+        * code, and it doesn't cost us much memory as the memory usage is
+        * dominated by buffers that hold the actual btree node data and those
+        * can be freed - and the number of struct btrees allocated is
+        * effectively bounded.
+        *
+        * btree_cache_freeable effectively is a small cache - we use it because
+        * high order page allocations can be rather expensive, and it's quite
+        * common to delete and allocate btree nodes in quick succession. It
+        * should never grow past ~2-3 nodes in practice.
+        */
+       struct list_head        btree_cache;
+       struct list_head        btree_cache_freeable;
+       struct list_head        btree_cache_freed;
+
+       /* Number of elements in btree_cache + btree_cache_freeable lists */
+       unsigned                bucket_cache_used;
+
+       /*
+        * If we need to allocate memory for a new btree node and that
+        * allocation fails, we can cannibalize another node in the btree cache
+        * to satisfy the allocation. However, only one thread can be doing this
+        * at a time, for obvious reasons - try_harder and try_wait are
+        * basically a lock for this that we can wait on asynchronously. The
+        * btree_root() macro releases the lock when it returns.
+        */
+       struct closure          *try_harder;
+       struct closure_waitlist try_wait;
+       uint64_t                try_harder_start;
+
+       /*
+        * When we free a btree node, we increment the gen of the bucket the
+        * node is in - but we can't rewrite the prios and gens until we
+        * have finished whatever it is we were doing; otherwise after a crash the
+        * btree node would be freed but for say a split, we might not have the
+        * pointers to the new nodes inserted into the btree yet.
+        *
+        * This is a refcount that blocks prio_write() until the new keys are
+        * written.
+        */
+       atomic_t                prio_blocked;
+       struct closure_waitlist bucket_wait;
+
+       /*
+        * For any bio we don't skip we subtract the number of sectors from
+        * rescale; when it hits 0 we rescale all the bucket priorities.
+        */
+       atomic_t                rescale;
+       /*
+        * When we invalidate buckets, we use both the priority and the amount
+        * of good data to determine which buckets to reuse first - to weight
+        * those together consistently we keep track of the smallest nonzero
+        * priority of any bucket.
+        */
+       uint16_t                min_prio;
+
+       /*
+        * max(gen - gc_gen) for all buckets. When it gets too big we have to gc
+        * to keep gens from wrapping around.
+        */
+       uint8_t                 need_gc;
+       struct gc_stat          gc_stats;
+       size_t                  nbuckets;
+
+       struct closure_with_waitlist gc;
+       /* Where in the btree gc currently is */
+       struct bkey             gc_done;
+
+       /*
+        * The allocation code needs gc_mark in struct bucket to be correct, but
+        * it's not while a gc is in progress. Protected by bucket_lock.
+        */
+       int                     gc_mark_valid;
+
+       /* Counts how many sectors bio_insert has added to the cache */
+       atomic_t                sectors_to_gc;
+
+       struct closure          moving_gc;
+       struct closure_waitlist moving_gc_wait;
+       struct keybuf           moving_gc_keys;
+       /* Number of moving GC bios in flight */
+       atomic_t                in_flight;
+
+       struct btree            *root;
+
+#ifdef CONFIG_BCACHE_DEBUG
+       struct btree            *verify_data;
+       struct mutex            verify_lock;
+#endif
+
+       unsigned                nr_uuids;
+       struct uuid_entry       *uuids;
+       BKEY_PADDED(uuid_bucket);
+       struct closure_with_waitlist uuid_write;
+
+       /*
+        * A btree node on disk could have too many bsets for an iterator to fit
+        * on the stack - this is a single element mempool for btree_read_work()
+        */
+       struct mutex            fill_lock;
+       struct btree_iter       *fill_iter;
+
+       /*
+        * btree_sort() is a merge sort and requires temporary space - single
+        * element mempool
+        */
+       struct mutex            sort_lock;
+       struct bset             *sort;
+
+       /* List of buckets we're currently writing data to */
+       struct list_head        data_buckets;
+       spinlock_t              data_bucket_lock;
+
+       struct journal          journal;
+
+#define CONGESTED_MAX          1024
+       unsigned                congested_last_us;
+       atomic_t                congested;
+
+       /* The rest of this all shows up in sysfs */
+       unsigned                congested_read_threshold_us;
+       unsigned                congested_write_threshold_us;
+
+       spinlock_t              sort_time_lock;
+       struct time_stats       sort_time;
+       struct time_stats       btree_gc_time;
+       struct time_stats       btree_split_time;
+       spinlock_t              btree_read_time_lock;
+       struct time_stats       btree_read_time;
+       struct time_stats       try_harder_time;
+
+       atomic_long_t           cache_read_races;
+       atomic_long_t           writeback_keys_done;
+       atomic_long_t           writeback_keys_failed;
+       unsigned                error_limit;
+       unsigned                error_decay;
+       unsigned short          journal_delay_ms;
+       unsigned                verify:1;
+       unsigned                key_merging_disabled:1;
+       unsigned                gc_always_rewrite:1;
+       unsigned                shrinker_disabled:1;
+       unsigned                copy_gc_enabled:1;
+
+#define BUCKET_HASH_BITS       12
+       struct hlist_head       bucket_hash[1 << BUCKET_HASH_BITS];
+};
+
+static inline bool key_merging_disabled(struct cache_set *c)
+{
+#ifdef CONFIG_BCACHE_DEBUG
+       return c->key_merging_disabled;
+#else
+       return 0;
+#endif
+}
+
+static inline bool SB_IS_BDEV(const struct cache_sb *sb)
+{
+       return sb->version == BCACHE_SB_VERSION_BDEV
+               || sb->version == BCACHE_SB_VERSION_BDEV_WITH_OFFSET;
+}
+
+struct bbio {
+       unsigned                submit_time_us;
+       union {
+               struct bkey     key;
+               uint64_t        _pad[3];
+               /*
+                * We only need pad = 3 here because we only ever carry around a
+                * single pointer - i.e. the pointer we're doing io to/from.
+                */
+       };
+       struct bio              bio;
+};
+
+static inline unsigned local_clock_us(void)
+{
+       return local_clock() >> 10;
+}
+
+#define MAX_BSETS              4U
+
+#define BTREE_PRIO             USHRT_MAX
+#define INITIAL_PRIO           32768
+
+#define btree_bytes(c)         ((c)->btree_pages * PAGE_SIZE)
+#define btree_blocks(b)                                                        \
+       ((unsigned) (KEY_SIZE(&b->key) >> (b)->c->block_bits))
+
+#define btree_default_blocks(c)                                                \
+       ((unsigned) ((PAGE_SECTORS * (c)->btree_pages) >> (c)->block_bits))
+
+#define bucket_pages(c)                ((c)->sb.bucket_size / PAGE_SECTORS)
+#define bucket_bytes(c)                ((c)->sb.bucket_size << 9)
+#define block_bytes(c)         ((c)->sb.block_size << 9)
+
+#define __set_bytes(i, k)      (sizeof(*(i)) + (k) * sizeof(uint64_t))
+#define set_bytes(i)           __set_bytes(i, i->keys)
+
+#define __set_blocks(i, k, c)  DIV_ROUND_UP(__set_bytes(i, k), block_bytes(c))
+#define set_blocks(i, c)       __set_blocks(i, (i)->keys, c)
+
+#define node(i, j)             ((struct bkey *) ((i)->d + (j)))
+#define end(i)                 node(i, (i)->keys)
+
+#define index(i, b)                                                    \
+       ((size_t) (((void *) i - (void *) (b)->sets[0].data) /          \
+                  block_bytes(b->c)))
+
+#define btree_data_space(b)    (PAGE_SIZE << (b)->page_order)
+
+#define prios_per_bucket(c)                            \
+       ((bucket_bytes(c) - sizeof(struct prio_set)) /  \
+        sizeof(struct bucket_disk))
+#define prio_buckets(c)                                        \
+       DIV_ROUND_UP((size_t) (c)->sb.nbuckets, prios_per_bucket(c))
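+
+/*
+ * Rough worked example: struct bucket_disk is 3 bytes, so with 1 MiB buckets
+ * prios_per_bucket() comes out to roughly 350,000 and even a very large cache
+ * device needs only a handful of prio buckets.
+ */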
+
+#define JSET_MAGIC             0x245235c1a3625032ULL
+#define PSET_MAGIC             0x6750e15f87337f91ULL
+#define BSET_MAGIC             0x90135c78b99e07f5ULL
+
+#define jset_magic(c)          ((c)->sb.set_magic ^ JSET_MAGIC)
+#define pset_magic(c)          ((c)->sb.set_magic ^ PSET_MAGIC)
+#define bset_magic(c)          ((c)->sb.set_magic ^ BSET_MAGIC)
+
+/* Bkey fields: all units are in sectors */
+
+#define KEY_FIELD(name, field, offset, size)                           \
+       BITMASK(name, struct bkey, field, offset, size)
+
+#define PTR_FIELD(name, offset, size)                                  \
+       static inline uint64_t name(const struct bkey *k, unsigned i)   \
+       { return (k->ptr[i] >> offset) & ~(((uint64_t) ~0) << size); }  \
+                                                                       \
+       static inline void SET_##name(struct bkey *k, unsigned i, uint64_t v)\
+       {                                                               \
+               k->ptr[i] &= ~(~((uint64_t) ~0 << size) << offset);     \
+               k->ptr[i] |= v << offset;                               \
+       }
+
+KEY_FIELD(KEY_PTRS,    high, 60, 3)
+KEY_FIELD(HEADER_SIZE, high, 58, 2)
+KEY_FIELD(KEY_CSUM,    high, 56, 2)
+KEY_FIELD(KEY_PINNED,  high, 55, 1)
+KEY_FIELD(KEY_DIRTY,   high, 36, 1)
+
+KEY_FIELD(KEY_SIZE,    high, 20, 16)
+KEY_FIELD(KEY_INODE,   high, 0,  20)
+
+/* Next time I change the on disk format, KEY_OFFSET() won't be 64 bits */
+
+static inline uint64_t KEY_OFFSET(const struct bkey *k)
+{
+       return k->low;
+}
+
+static inline void SET_KEY_OFFSET(struct bkey *k, uint64_t v)
+{
+       k->low = v;
+}
+
+PTR_FIELD(PTR_DEV,             51, 12)
+PTR_FIELD(PTR_OFFSET,          8,  43)
+PTR_FIELD(PTR_GEN,             0,  8)
+
+#define PTR_CHECK_DEV          ((1 << 12) - 1)
+
+#define PTR(gen, offset, dev)                                          \
+       ((((uint64_t) dev) << 51) | ((uint64_t) offset) << 8 | gen)
+
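+/*
+ * Example, mirroring __bch_bucket_alloc_set() in alloc.c - a freshly
+ * allocated bucket becomes an on-disk pointer by packing its gen, its
+ * starting sector and the device index:
+ *
+ *	k->ptr[i] = PTR(ca->buckets[b].gen,
+ *			bucket_to_sector(c, b),
+ *			ca->sb.nr_this_dev);
+ *
+ * PTR_GEN(), PTR_OFFSET() and PTR_DEV() above pull the three fields back out.
+ */
+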
+static inline size_t sector_to_bucket(struct cache_set *c, sector_t s)
+{
+       return s >> c->bucket_bits;
+}
+
+static inline sector_t bucket_to_sector(struct cache_set *c, size_t b)
+{
+       return ((sector_t) b) << c->bucket_bits;
+}
+
+static inline sector_t bucket_remainder(struct cache_set *c, sector_t s)
+{
+       return s & (c->sb.bucket_size - 1);
+}
+
+static inline struct cache *PTR_CACHE(struct cache_set *c,
+                                     const struct bkey *k,
+                                     unsigned ptr)
+{
+       return c->cache[PTR_DEV(k, ptr)];
+}
+
+static inline size_t PTR_BUCKET_NR(struct cache_set *c,
+                                  const struct bkey *k,
+                                  unsigned ptr)
+{
+       return sector_to_bucket(c, PTR_OFFSET(k, ptr));
+}
+
+static inline struct bucket *PTR_BUCKET(struct cache_set *c,
+                                       const struct bkey *k,
+                                       unsigned ptr)
+{
+       return PTR_CACHE(c, k, ptr)->buckets + PTR_BUCKET_NR(c, k, ptr);
+}
+
+/* Btree key macros */
+
+/*
+ * The high bit being set is a relic from when we used it to do binary
+ * searches - it told you where a key started. It's not used anymore,
+ * and can probably be safely dropped.
+ */
+#define KEY(dev, sector, len)                                          \
+((struct bkey) {                                                       \
+       .high = (1ULL << 63) | ((uint64_t) (len) << 20) | (dev),        \
+       .low = (sector)                                                 \
+})
+
+static inline void bkey_init(struct bkey *k)
+{
+       *k = KEY(0, 0, 0);
+}
+
+#define KEY_START(k)           (KEY_OFFSET(k) - KEY_SIZE(k))
+#define START_KEY(k)           KEY(KEY_INODE(k), KEY_START(k), 0)
+#define MAX_KEY                        KEY(~(~0 << 20), ((uint64_t) ~0) >> 1, 0)
+#define ZERO_KEY               KEY(0, 0, 0)
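+
+/*
+ * Worked example: keys index extents by their ending offset, so an 8 sector
+ * extent covering sectors 100..107 of inode 5 is built as
+ *
+ *	struct bkey k = KEY(5, 108, 8);
+ *
+ * giving KEY_INODE(&k) == 5, KEY_SIZE(&k) == 8, KEY_OFFSET(&k) == 108 and
+ * KEY_START(&k) == 100.
+ */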
+
+/*
+ * This is used for various on disk data structures - cache_sb, prio_set, bset,
+ * jset: The checksum is _always_ the first 8 bytes of these structs
+ */
+#define csum_set(i)                                                    \
+       bch_crc64(((void *) (i)) + sizeof(uint64_t),                    \
+             ((void *) end(i)) - (((void *) (i)) + sizeof(uint64_t)))
+
+/* Error handling macros */
+
+#define btree_bug(b, ...)                                              \
+do {                                                                   \
+       if (bch_cache_set_error((b)->c, __VA_ARGS__))                   \
+               dump_stack();                                           \
+} while (0)
+
+#define cache_bug(c, ...)                                              \
+do {                                                                   \
+       if (bch_cache_set_error(c, __VA_ARGS__))                        \
+               dump_stack();                                           \
+} while (0)
+
+#define btree_bug_on(cond, b, ...)                                     \
+do {                                                                   \
+       if (cond)                                                       \
+               btree_bug(b, __VA_ARGS__);                              \
+} while (0)
+
+#define cache_bug_on(cond, c, ...)                                     \
+do {                                                                   \
+       if (cond)                                                       \
+               cache_bug(c, __VA_ARGS__);                              \
+} while (0)
+
+#define cache_set_err_on(cond, c, ...)                                 \
+do {                                                                   \
+       if (cond)                                                       \
+               bch_cache_set_error(c, __VA_ARGS__);                    \
+} while (0)
+
+/* Looping macros */
+
+#define for_each_cache(ca, cs, iter)                                   \
+       for (iter = 0; ca = cs->cache[iter], iter < (cs)->sb.nr_in_set; iter++)
+
+#define for_each_bucket(b, ca)                                         \
+       for (b = (ca)->buckets + (ca)->sb.first_bucket;                 \
+            b < (ca)->buckets + (ca)->sb.nbuckets; b++)
+
+static inline void __bkey_put(struct cache_set *c, struct bkey *k)
+{
+       unsigned i;
+
+       for (i = 0; i < KEY_PTRS(k); i++)
+               atomic_dec_bug(&PTR_BUCKET(c, k, i)->pin);
+}
+
+/* Blktrace macros */
+
+#define blktrace_msg(c, fmt, ...)                                      \
+do {                                                                   \
+       struct request_queue *q = bdev_get_queue(c->bdev);              \
+       if (q)                                                          \
+               blk_add_trace_msg(q, fmt, ##__VA_ARGS__);               \
+} while (0)
+
+#define blktrace_msg_all(s, fmt, ...)                                  \
+do {                                                                   \
+       struct cache *_c;                                               \
+       unsigned i;                                                     \
+       for_each_cache(_c, (s), i)                                      \
+               blktrace_msg(_c, fmt, ##__VA_ARGS__);                   \
+} while (0)
+
+static inline void cached_dev_put(struct cached_dev *dc)
+{
+       if (atomic_dec_and_test(&dc->count))
+               schedule_work(&dc->detach);
+}
+
+static inline bool cached_dev_get(struct cached_dev *dc)
+{
+       if (!atomic_inc_not_zero(&dc->count))
+               return false;
+
+       /* Paired with the mb in cached_dev_attach */
+       smp_mb__after_atomic_inc();
+       return true;
+}
+
+/*
+ * bucket_gc_gen() returns the difference between the bucket's current gen and
+ * the oldest gen of any pointer into that bucket in the btree (last_gc).
+ *
+ * bucket_disk_gen() returns the difference between the current gen and the gen
+ * on disk; they're both used to make sure gens don't wrap around.
+ */
+
+static inline uint8_t bucket_gc_gen(struct bucket *b)
+{
+       return b->gen - b->last_gc;
+}
+
+static inline uint8_t bucket_disk_gen(struct bucket *b)
+{
+       return b->gen - b->disk_gen;
+}
+
+#define BUCKET_GC_GEN_MAX      96U
+#define BUCKET_DISK_GEN_MAX    64U
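+
+/*
+ * These bound the two differences above: can_inc_bucket_gen() in alloc.c
+ * refuses to bump a bucket's gen once either difference gets this large,
+ * which is what forces a gc (or a prio_write()) before gens can wrap around.
+ */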
+
+#define kobj_attribute_write(n, fn)                                    \
+       static struct kobj_attribute ksysfs_##n = __ATTR(n, S_IWUSR, NULL, fn)
+
+#define kobj_attribute_rw(n, show, store)                              \
+       static struct kobj_attribute ksysfs_##n =                       \
+               __ATTR(n, S_IWUSR|S_IRUSR, show, store)
+
+/* Forward declarations */
+
+void bch_writeback_queue(struct cached_dev *);
+void bch_writeback_add(struct cached_dev *, unsigned);
+
+void bch_count_io_errors(struct cache *, int, const char *);
+void bch_bbio_count_io_errors(struct cache_set *, struct bio *,
+                             int, const char *);
+void bch_bbio_endio(struct cache_set *, struct bio *, int, const char *);
+void bch_bbio_free(struct bio *, struct cache_set *);
+struct bio *bch_bbio_alloc(struct cache_set *);
+
+struct bio *bch_bio_split(struct bio *, int, gfp_t, struct bio_set *);
+void bch_generic_make_request(struct bio *, struct bio_split_pool *);
+void __bch_submit_bbio(struct bio *, struct cache_set *);
+void bch_submit_bbio(struct bio *, struct cache_set *, struct bkey *, unsigned);
+
+uint8_t bch_inc_gen(struct cache *, struct bucket *);
+void bch_rescale_priorities(struct cache_set *, int);
+bool bch_bucket_add_unused(struct cache *, struct bucket *);
+void bch_allocator_thread(struct closure *);
+
+long bch_bucket_alloc(struct cache *, unsigned, struct closure *);
+void bch_bucket_free(struct cache_set *, struct bkey *);
+
+int __bch_bucket_alloc_set(struct cache_set *, unsigned,
+                          struct bkey *, int, struct closure *);
+int bch_bucket_alloc_set(struct cache_set *, unsigned,
+                        struct bkey *, int, struct closure *);
+
+__printf(2, 3)
+bool bch_cache_set_error(struct cache_set *, const char *, ...);
+
+void bch_prio_write(struct cache *);
+void bch_write_bdev_super(struct cached_dev *, struct closure *);
+
+extern struct workqueue_struct *bcache_wq, *bch_gc_wq;
+extern const char * const bch_cache_modes[];
+extern struct mutex bch_register_lock;
+extern struct list_head bch_cache_sets;
+
+extern struct kobj_type bch_cached_dev_ktype;
+extern struct kobj_type bch_flash_dev_ktype;
+extern struct kobj_type bch_cache_set_ktype;
+extern struct kobj_type bch_cache_set_internal_ktype;
+extern struct kobj_type bch_cache_ktype;
+
+void bch_cached_dev_release(struct kobject *);
+void bch_flash_dev_release(struct kobject *);
+void bch_cache_set_release(struct kobject *);
+void bch_cache_release(struct kobject *);
+
+int bch_uuid_write(struct cache_set *);
+void bcache_write_super(struct cache_set *);
+
+int bch_flash_dev_create(struct cache_set *c, uint64_t size);
+
+int bch_cached_dev_attach(struct cached_dev *, struct cache_set *);
+void bch_cached_dev_detach(struct cached_dev *);
+void bch_cached_dev_run(struct cached_dev *);
+void bcache_device_stop(struct bcache_device *);
+
+void bch_cache_set_unregister(struct cache_set *);
+void bch_cache_set_stop(struct cache_set *);
+
+struct cache_set *bch_cache_set_alloc(struct cache_sb *);
+void bch_btree_cache_free(struct cache_set *);
+int bch_btree_cache_alloc(struct cache_set *);
+void bch_writeback_init_cached_dev(struct cached_dev *);
+void bch_moving_init_cache_set(struct cache_set *);
+
+void bch_cache_allocator_exit(struct cache *ca);
+int bch_cache_allocator_init(struct cache *ca);
+
+void bch_debug_exit(void);
+int bch_debug_init(struct kobject *);
+void bch_writeback_exit(void);
+int bch_writeback_init(void);
+void bch_request_exit(void);
+int bch_request_init(void);
+void bch_btree_exit(void);
+int bch_btree_init(void);
+
+#endif /* _BCACHE_H */
diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c
new file mode 100644 (file)
index 0000000..cb4578a
--- /dev/null
@@ -0,0 +1,1192 @@
+/*
+ * Code for working with individual keys, and sorted sets of keys within a
+ * btree node
+ *
+ * Copyright 2012 Google, Inc.
+ */
+
+#include "bcache.h"
+#include "btree.h"
+#include "debug.h"
+
+#include <linux/random.h>
+#include <linux/prefetch.h>
+
+/* Keylists */
+
+void bch_keylist_copy(struct keylist *dest, struct keylist *src)
+{
+       *dest = *src;
+
+       if (src->list == src->d) {
+               size_t n = (uint64_t *) src->top - src->d;
+               dest->top = (struct bkey *) &dest->d[n];
+               dest->list = dest->d;
+       }
+}
+
+int bch_keylist_realloc(struct keylist *l, int nptrs, struct cache_set *c)
+{
+       unsigned oldsize = (uint64_t *) l->top - l->list;
+       unsigned newsize = oldsize + 2 + nptrs;
+       uint64_t *new;
+
+       /* The journalling code doesn't handle the case where the keys to insert
+        * are bigger than an empty write: if we just return -ENOMEM here,
+        * bio_insert() and bio_invalidate() will insert the keys created so far
+        * and finish the rest when the keylist is empty.
+        */
+       if (newsize * sizeof(uint64_t) > block_bytes(c) - sizeof(struct jset))
+               return -ENOMEM;
+
+       newsize = roundup_pow_of_two(newsize);
+
+       if (newsize <= KEYLIST_INLINE ||
+           roundup_pow_of_two(oldsize) == newsize)
+               return 0;
+
+       new = krealloc(l->list == l->d ? NULL : l->list,
+                      sizeof(uint64_t) * newsize, GFP_NOIO);
+
+       if (!new)
+               return -ENOMEM;
+
+       if (l->list == l->d)
+               memcpy(new, l->list, sizeof(uint64_t) * KEYLIST_INLINE);
+
+       l->list = new;
+       l->top = (struct bkey *) (&l->list[oldsize]);
+
+       return 0;
+}
+
+struct bkey *bch_keylist_pop(struct keylist *l)
+{
+       struct bkey *k = l->bottom;
+
+       if (k == l->top)
+               return NULL;
+
+       while (bkey_next(k) != l->top)
+               k = bkey_next(k);
+
+       return l->top = k;
+}
+
+/* Pointer validation */
+
+bool __bch_ptr_invalid(struct cache_set *c, int level, const struct bkey *k)
+{
+       unsigned i;
+
+       if (level && (!KEY_PTRS(k) || !KEY_SIZE(k) || KEY_DIRTY(k)))
+               goto bad;
+
+       if (!level && KEY_SIZE(k) > KEY_OFFSET(k))
+               goto bad;
+
+       if (!KEY_SIZE(k))
+               return true;
+
+       for (i = 0; i < KEY_PTRS(k); i++)
+               if (ptr_available(c, k, i)) {
+                       struct cache *ca = PTR_CACHE(c, k, i);
+                       size_t bucket = PTR_BUCKET_NR(c, k, i);
+                       size_t r = bucket_remainder(c, PTR_OFFSET(k, i));
+
+                       if (KEY_SIZE(k) + r > c->sb.bucket_size ||
+                           bucket <  ca->sb.first_bucket ||
+                           bucket >= ca->sb.nbuckets)
+                               goto bad;
+               }
+
+       return false;
+bad:
+       cache_bug(c, "spotted bad key %s: %s", pkey(k), bch_ptr_status(c, k));
+       return true;
+}
+
+bool bch_ptr_bad(struct btree *b, const struct bkey *k)
+{
+       struct bucket *g;
+       unsigned i, stale;
+
+       if (!bkey_cmp(k, &ZERO_KEY) ||
+           !KEY_PTRS(k) ||
+           bch_ptr_invalid(b, k))
+               return true;
+
+       if (KEY_PTRS(k) && PTR_DEV(k, 0) == PTR_CHECK_DEV)
+               return true;
+
+       for (i = 0; i < KEY_PTRS(k); i++)
+               if (ptr_available(b->c, k, i)) {
+                       g = PTR_BUCKET(b->c, k, i);
+                       stale = ptr_stale(b->c, k, i);
+
+                       btree_bug_on(stale > 96, b,
+                                    "key too stale: %i, need_gc %u",
+                                    stale, b->c->need_gc);
+
+                       btree_bug_on(stale && KEY_DIRTY(k) && KEY_SIZE(k),
+                                    b, "stale dirty pointer");
+
+                       if (stale)
+                               return true;
+
+#ifdef CONFIG_BCACHE_EDEBUG
+                       if (!mutex_trylock(&b->c->bucket_lock))
+                               continue;
+
+                       if (b->level) {
+                               if (KEY_DIRTY(k) ||
+                                   g->prio != BTREE_PRIO ||
+                                   (b->c->gc_mark_valid &&
+                                    GC_MARK(g) != GC_MARK_METADATA))
+                                       goto bug;
+
+                       } else {
+                               if (g->prio == BTREE_PRIO)
+                                       goto bug;
+
+                               if (KEY_DIRTY(k) &&
+                                   b->c->gc_mark_valid &&
+                                   GC_MARK(g) != GC_MARK_DIRTY)
+                                       goto bug;
+                       }
+                       mutex_unlock(&b->c->bucket_lock);
+#endif
+               }
+
+       return false;
+#ifdef CONFIG_BCACHE_EDEBUG
+bug:
+       mutex_unlock(&b->c->bucket_lock);
+       btree_bug(b,
+"inconsistent pointer %s: bucket %zu pin %i prio %i gen %i last_gc %i mark %llu gc_gen %i",
+                 pkey(k), PTR_BUCKET_NR(b->c, k, i), atomic_read(&g->pin),
+                 g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen);
+       return true;
+#endif
+}
+
+/* Key/pointer manipulation */
+
+void bch_bkey_copy_single_ptr(struct bkey *dest, const struct bkey *src,
+                             unsigned i)
+{
+       BUG_ON(i > KEY_PTRS(src));
+
+       /* Only copy the header, key, and one pointer. */
+       memcpy(dest, src, 2 * sizeof(uint64_t));
+       dest->ptr[0] = src->ptr[i];
+       SET_KEY_PTRS(dest, 1);
+       /* We didn't copy the checksum so clear that bit. */
+       SET_KEY_CSUM(dest, 0);
+}
+
+bool __bch_cut_front(const struct bkey *where, struct bkey *k)
+{
+       unsigned i, len = 0;
+
+       if (bkey_cmp(where, &START_KEY(k)) <= 0)
+               return false;
+
+       if (bkey_cmp(where, k) < 0)
+               len = KEY_OFFSET(k) - KEY_OFFSET(where);
+       else
+               bkey_copy_key(k, where);
+
+       for (i = 0; i < KEY_PTRS(k); i++)
+               SET_PTR_OFFSET(k, i, PTR_OFFSET(k, i) + KEY_SIZE(k) - len);
+
+       BUG_ON(len > KEY_SIZE(k));
+       SET_KEY_SIZE(k, len);
+       return true;
+}
+
+bool __bch_cut_back(const struct bkey *where, struct bkey *k)
+{
+       unsigned len = 0;
+
+       if (bkey_cmp(where, k) >= 0)
+               return false;
+
+       BUG_ON(KEY_INODE(where) != KEY_INODE(k));
+
+       if (bkey_cmp(where, &START_KEY(k)) > 0)
+               len = KEY_OFFSET(where) - KEY_START(k);
+
+       bkey_copy_key(k, where);
+
+       BUG_ON(len > KEY_SIZE(k));
+       SET_KEY_SIZE(k, len);
+       return true;
+}
+
+static uint64_t merge_chksums(struct bkey *l, struct bkey *r)
+{
+       return (l->ptr[KEY_PTRS(l)] + r->ptr[KEY_PTRS(r)]) &
+               ~((uint64_t)1 << 63);
+}
+
+/* Tries to merge l and r: l should be lower than r
+ * Returns true if we were able to merge. If we did merge, l will be the merged
+ * key, r will be untouched.
+ */
+bool bch_bkey_try_merge(struct btree *b, struct bkey *l, struct bkey *r)
+{
+       unsigned i;
+
+       if (key_merging_disabled(b->c))
+               return false;
+
+       if (KEY_PTRS(l) != KEY_PTRS(r) ||
+           KEY_DIRTY(l) != KEY_DIRTY(r) ||
+           bkey_cmp(l, &START_KEY(r)))
+               return false;
+
+       for (i = 0; i < KEY_PTRS(l); i++)
+               if (l->ptr[i] + PTR(0, KEY_SIZE(l), 0) != r->ptr[i] ||
+                   PTR_BUCKET_NR(b->c, l, i) != PTR_BUCKET_NR(b->c, r, i))
+                       return false;
+
+       /* Keys with no pointers aren't restricted to one bucket and could
+        * overflow KEY_SIZE
+        */
+       if (KEY_SIZE(l) + KEY_SIZE(r) > USHRT_MAX) {
+               SET_KEY_OFFSET(l, KEY_OFFSET(l) + USHRT_MAX - KEY_SIZE(l));
+               SET_KEY_SIZE(l, USHRT_MAX);
+
+               bch_cut_front(l, r);
+               return false;
+       }
+
+       if (KEY_CSUM(l)) {
+               if (KEY_CSUM(r))
+                       l->ptr[KEY_PTRS(l)] = merge_chksums(l, r);
+               else
+                       SET_KEY_CSUM(l, 0);
+       }
+
+       SET_KEY_OFFSET(l, KEY_OFFSET(l) + KEY_SIZE(r));
+       SET_KEY_SIZE(l, KEY_SIZE(l) + KEY_SIZE(r));
+
+       return true;
+}
+
+/* Binary tree stuff for auxiliary search trees */
+
+static unsigned inorder_next(unsigned j, unsigned size)
+{
+       if (j * 2 + 1 < size) {
+               j = j * 2 + 1;
+
+               while (j * 2 < size)
+                       j *= 2;
+       } else
+               j >>= ffz(j) + 1;
+
+       return j;
+}
+
+static unsigned inorder_prev(unsigned j, unsigned size)
+{
+       if (j * 2 < size) {
+               j = j * 2;
+
+               while (j * 2 + 1 < size)
+                       j = j * 2 + 1;
+       } else
+               j >>= ffs(j);
+
+       return j;
+}
+
+/* I have no idea why this code works... and I'm the one who wrote it
+ *
+ * However, I do know what it does:
+ * Given a binary tree constructed in an array (i.e. how you normally implement
+ * a heap), it converts a node in the tree - referenced by array index - to the
+ * index it would have if you did an inorder traversal.
+ *
+ * Also tested for every j and size, up to sizes somewhere around 6 million.
+ *
+ * The binary tree starts at array index 1, not 0
+ * extra is a function of size:
+ *   extra = (size - rounddown_pow_of_two(size - 1)) << 1;
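+ *
+ * For example, with size == 7 (tree nodes 1..6) extra is (7 - 4) << 1 == 6,
+ * and __to_inorder() maps nodes 1..6 to inorder positions 4 2 6 1 3 5, i.e.
+ * node 4 (the leftmost leaf) comes first and the root comes fourth.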
+ */
+static unsigned __to_inorder(unsigned j, unsigned size, unsigned extra)
+{
+       unsigned b = fls(j);
+       unsigned shift = fls(size - 1) - b;
+
+       j  ^= 1U << (b - 1);
+       j <<= 1;
+       j  |= 1;
+       j <<= shift;
+
+       if (j > extra)
+               j -= (j - extra) >> 1;
+
+       return j;
+}
+
+static unsigned to_inorder(unsigned j, struct bset_tree *t)
+{
+       return __to_inorder(j, t->size, t->extra);
+}
+
+static unsigned __inorder_to_tree(unsigned j, unsigned size, unsigned extra)
+{
+       unsigned shift;
+
+       if (j > extra)
+               j += j - extra;
+
+       shift = ffs(j);
+
+       j >>= shift;
+       j  |= roundup_pow_of_two(size) >> shift;
+
+       return j;
+}
+
+static unsigned inorder_to_tree(unsigned j, struct bset_tree *t)
+{
+       return __inorder_to_tree(j, t->size, t->extra);
+}
+
+#if 0
+void inorder_test(void)
+{
+       unsigned long done = 0;
+       ktime_t start = ktime_get();
+
+       for (unsigned size = 2;
+            size < 65536000;
+            size++) {
+               unsigned extra = (size - rounddown_pow_of_two(size - 1)) << 1;
+               unsigned i = 1, j = rounddown_pow_of_two(size - 1);
+
+               if (!(size % 4096))
+                       printk(KERN_NOTICE "loop %u, %llu per us\n", size,
+                              done / ktime_us_delta(ktime_get(), start));
+
+               while (1) {
+                       if (__inorder_to_tree(i, size, extra) != j)
+                               panic("size %10u j %10u i %10u", size, j, i);
+
+                       if (__to_inorder(j, size, extra) != i)
+                               panic("size %10u j %10u i %10u", size, j, i);
+
+                       if (j == rounddown_pow_of_two(size) - 1)
+                               break;
+
+                       BUG_ON(inorder_prev(inorder_next(j, size), size) != j);
+
+                       j = inorder_next(j, size);
+                       i++;
+               }
+
+               done += size - 1;
+       }
+}
+#endif
+
+/*
+ * Cacheline/offset <-> bkey pointer arithmetic:
+ *
+ * t->tree is a binary search tree in an array; each node corresponds to a key
+ * in one cacheline in t->set (BSET_CACHELINE bytes).
+ *
+ * This means we don't have to store the full index of the key that a node in
+ * the binary tree points to; to_inorder() gives us the cacheline, and then
+ * bkey_float->m gives us the offset within that cacheline, in units of 8 bytes.
+ *
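+ * For example, with BSET_CACHELINE == 128, cacheline 3 offset 2 refers to
+ * the key starting 3 * 128 + 2 * 8 = 400 bytes past t->data.
+ *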
+ * cacheline_to_bkey() and friends abstract out all the pointer arithmetic to
+ * make this work.
+ *
+ * To construct the bfloat for an arbitrary key we need to know what the key
+ * immediately preceding it is: we have to check if the two keys differ in the
+ * bits we're going to store in bkey_float->mantissa. t->prev[j] stores the size
+ * of the previous key so we can walk backwards to it from t->tree[j]'s key.
+ */
+
+static struct bkey *cacheline_to_bkey(struct bset_tree *t, unsigned cacheline,
+                                     unsigned offset)
+{
+       return ((void *) t->data) + cacheline * BSET_CACHELINE + offset * 8;
+}
+
+static unsigned bkey_to_cacheline(struct bset_tree *t, struct bkey *k)
+{
+       return ((void *) k - (void *) t->data) / BSET_CACHELINE;
+}
+
+static unsigned bkey_to_cacheline_offset(struct bkey *k)
+{
+       return ((size_t) k & (BSET_CACHELINE - 1)) / sizeof(uint64_t);
+}
+
+static struct bkey *tree_to_bkey(struct bset_tree *t, unsigned j)
+{
+       return cacheline_to_bkey(t, to_inorder(j, t), t->tree[j].m);
+}
+
+static struct bkey *tree_to_prev_bkey(struct bset_tree *t, unsigned j)
+{
+       return (void *) (((uint64_t *) tree_to_bkey(t, j)) - t->prev[j]);
+}
+
+/*
+ * For the write set - the one we're currently inserting keys into - we don't
+ * maintain a full search tree, we just keep a simple lookup table in t->prev.
+ */
+static struct bkey *table_to_bkey(struct bset_tree *t, unsigned cacheline)
+{
+       return cacheline_to_bkey(t, cacheline, t->prev[cacheline]);
+}
+
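+/*
+ * Return the low 64 bits of the 128-bit value (high:low) shifted right by
+ * shift (0 <= shift < 64); bfloat_mantissa() below uses it to pull an
+ * arbitrary bit range out of a key.
+ */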
+static inline uint64_t shrd128(uint64_t high, uint64_t low, uint8_t shift)
+{
+#ifdef CONFIG_X86_64
+       asm("shrd %[shift],%[high],%[low]"
+           : [low] "+Rm" (low)
+           : [high] "R" (high),
+           [shift] "ci" (shift)
+           : "cc");
+#else
+       low >>= shift;
+       low  |= (high << 1) << (63U - shift);
+#endif
+       return low;
+}
+
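+/*
+ * Extract BKEY_MANTISSA_BITS bits of the 128-bit key value, starting at bit
+ * f->exponent: exponent >> 6 selects which 64-bit word of the key to start
+ * in, exponent & 63 the bit within that word.
+ */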
+static inline unsigned bfloat_mantissa(const struct bkey *k,
+                                      struct bkey_float *f)
+{
+       const uint64_t *p = &k->low - (f->exponent >> 6);
+       return shrd128(p[-1], p[0], f->exponent & 63) & BKEY_MANTISSA_MASK;
+}
+
+static void make_bfloat(struct bset_tree *t, unsigned j)
+{
+       struct bkey_float *f = &t->tree[j];
+       struct bkey *m = tree_to_bkey(t, j);
+       struct bkey *p = tree_to_prev_bkey(t, j);
+
+       struct bkey *l = is_power_of_2(j)
+               ? t->data->start
+               : tree_to_prev_bkey(t, j >> ffs(j));
+
+       struct bkey *r = is_power_of_2(j + 1)
+               ? node(t->data, t->data->keys - bkey_u64s(&t->end))
+               : tree_to_bkey(t, j >> (ffz(j) + 1));
+
+       BUG_ON(m < l || m > r);
+       BUG_ON(bkey_next(p) != m);
+
+       if (KEY_INODE(l) != KEY_INODE(r))
+               f->exponent = fls64(KEY_INODE(r) ^ KEY_INODE(l)) + 64;
+       else
+               f->exponent = fls64(r->low ^ l->low);
+
+       f->exponent = max_t(int, f->exponent - BKEY_MANTISSA_BITS, 0);
+
+       /*
+        * Setting f->exponent = 127 flags this node as failed, and causes the
+        * lookup code to fall back to comparing against the original key.
+        */
+
+       if (bfloat_mantissa(m, f) != bfloat_mantissa(p, f))
+               f->mantissa = bfloat_mantissa(m, f) - 1;
+       else
+               f->exponent = 127;
+}
+
+static void bset_alloc_tree(struct btree *b, struct bset_tree *t)
+{
+       if (t != b->sets) {
+               unsigned j = roundup(t[-1].size,
+                                    64 / sizeof(struct bkey_float));
+
+               t->tree = t[-1].tree + j;
+               t->prev = t[-1].prev + j;
+       }
+
+       while (t < b->sets + MAX_BSETS)
+               t++->size = 0;
+}
+
+static void bset_build_unwritten_tree(struct btree *b)
+{
+       struct bset_tree *t = b->sets + b->nsets;
+
+       bset_alloc_tree(b, t);
+
+       if (t->tree != b->sets->tree + bset_tree_space(b)) {
+               t->prev[0] = bkey_to_cacheline_offset(t->data->start);
+               t->size = 1;
+       }
+}
+
+static void bset_build_written_tree(struct btree *b)
+{
+       struct bset_tree *t = b->sets + b->nsets;
+       struct bkey *k = t->data->start;
+       unsigned j, cacheline = 1;
+
+       bset_alloc_tree(b, t);
+
+       t->size = min_t(unsigned,
+                       bkey_to_cacheline(t, end(t->data)),
+                       b->sets->tree + bset_tree_space(b) - t->tree);
+
+       if (t->size < 2) {
+               t->size = 0;
+               return;
+       }
+
+       t->extra = (t->size - rounddown_pow_of_two(t->size - 1)) << 1;
+
+       /* First we figure out where the first key in each cacheline is */
+       for (j = inorder_next(0, t->size);
+            j;
+            j = inorder_next(j, t->size)) {
+               while (bkey_to_cacheline(t, k) != cacheline)
+                       k = bkey_next(k);
+
+               t->prev[j] = bkey_u64s(k);
+               k = bkey_next(k);
+               cacheline++;
+               t->tree[j].m = bkey_to_cacheline_offset(k);
+       }
+
+       while (bkey_next(k) != end(t->data))
+               k = bkey_next(k);
+
+       t->end = *k;
+
+       /* Then we build the tree */
+       for (j = inorder_next(0, t->size);
+            j;
+            j = inorder_next(j, t->size))
+               make_bfloat(t, j);
+}
+
+void bch_bset_fix_invalidated_key(struct btree *b, struct bkey *k)
+{
+       struct bset_tree *t;
+       unsigned inorder, j = 1;
+
+       for (t = b->sets; t <= &b->sets[b->nsets]; t++)
+               if (k < end(t->data))
+                       goto found_set;
+
+       BUG();
+found_set:
+       if (!t->size || !bset_written(b, t))
+               return;
+
+       inorder = bkey_to_cacheline(t, k);
+
+       if (k == t->data->start)
+               goto fix_left;
+
+       if (bkey_next(k) == end(t->data)) {
+               t->end = *k;
+               goto fix_right;
+       }
+
+       j = inorder_to_tree(inorder, t);
+
+       if (j &&
+           j < t->size &&
+           k == tree_to_bkey(t, j))
+fix_left:      do {
+                       make_bfloat(t, j);
+                       j = j * 2;
+               } while (j < t->size);
+
+       j = inorder_to_tree(inorder + 1, t);
+
+       if (j &&
+           j < t->size &&
+           k == tree_to_prev_bkey(t, j))
+fix_right:     do {
+                       make_bfloat(t, j);
+                       j = j * 2 + 1;
+               } while (j < t->size);
+}
+
+void bch_bset_fix_lookup_table(struct btree *b, struct bkey *k)
+{
+       struct bset_tree *t = &b->sets[b->nsets];
+       unsigned shift = bkey_u64s(k);
+       unsigned j = bkey_to_cacheline(t, k);
+
+       /* We're getting called from btree_split() or btree_gc, just bail out */
+       if (!t->size)
+               return;
+
+       /* k is the key we just inserted; we need to find the entry in the
+        * lookup table for the first key that is strictly greater than k:
+        * it's either k's cacheline or the next one
+        */
+       if (j < t->size &&
+           table_to_bkey(t, j) <= k)
+               j++;
+
+       /* Adjust all the lookup table entries, and find a new key for any that
+        * have gotten too big
+        */
+       for (; j < t->size; j++) {
+               t->prev[j] += shift;
+
+               if (t->prev[j] > 7) {
+                       k = table_to_bkey(t, j - 1);
+
+                       while (k < cacheline_to_bkey(t, j, 0))
+                               k = bkey_next(k);
+
+                       t->prev[j] = bkey_to_cacheline_offset(k);
+               }
+       }
+
+       if (t->size == b->sets->tree + bset_tree_space(b) - t->tree)
+               return;
+
+       /* Possibly add a new entry to the end of the lookup table */
+
+       for (k = table_to_bkey(t, t->size - 1);
+            k != end(t->data);
+            k = bkey_next(k))
+               if (t->size == bkey_to_cacheline(t, k)) {
+                       t->prev[t->size] = bkey_to_cacheline_offset(k);
+                       t->size++;
+               }
+}
+
+void bch_bset_init_next(struct btree *b)
+{
+       struct bset *i = write_block(b);
+
+       if (i != b->sets[0].data) {
+               b->sets[++b->nsets].data = i;
+               i->seq = b->sets[0].data->seq;
+       } else
+               get_random_bytes(&i->seq, sizeof(uint64_t));
+
+       i->magic        = bset_magic(b->c);
+       i->version      = 0;
+       i->keys         = 0;
+
+       bset_build_unwritten_tree(b);
+}
+
+struct bset_search_iter {
+       struct bkey *l, *r;
+};
+
+static struct bset_search_iter bset_search_write_set(struct btree *b,
+                                                    struct bset_tree *t,
+                                                    const struct bkey *search)
+{
+       unsigned li = 0, ri = t->size;
+
+       BUG_ON(!b->nsets &&
+              t->size < bkey_to_cacheline(t, end(t->data)));
+
+       while (li + 1 != ri) {
+               unsigned m = (li + ri) >> 1;
+
+               if (bkey_cmp(table_to_bkey(t, m), search) > 0)
+                       ri = m;
+               else
+                       li = m;
+       }
+
+       return (struct bset_search_iter) {
+               table_to_bkey(t, li),
+               ri < t->size ? table_to_bkey(t, ri) : end(t->data)
+       };
+}
+
+static struct bset_search_iter bset_search_tree(struct btree *b,
+                                               struct bset_tree *t,
+                                               const struct bkey *search)
+{
+       struct bkey *l, *r;
+       struct bkey_float *f;
+       unsigned inorder, j, n = 1;
+
+       do {
+               unsigned p = n << 4;
+               p &= ((int) (p - t->size)) >> 31;
+
+               prefetch(&t->tree[p]);
+
+               j = n;
+               f = &t->tree[j];
+
+               /*
+                * n = (f->mantissa > bfloat_mantissa())
+                *      ? j * 2
+                *      : j * 2 + 1;
+                *
+                * We need to subtract 1 from f->mantissa for the sign bit trick
+                * to work  - that's done in make_bfloat()
+                */
+               if (likely(f->exponent != 127))
+                       n = j * 2 + (((unsigned)
+                                     (f->mantissa -
+                                      bfloat_mantissa(search, f))) >> 31);
+               else
+                       n = (bkey_cmp(tree_to_bkey(t, j), search) > 0)
+                               ? j * 2
+                               : j * 2 + 1;
+       } while (n < t->size);
+
+       inorder = to_inorder(j, t);
+
+       /*
+        * n would have been the node we recursed to - the low bit tells us if
+        * we recursed left or recursed right.
+        */
+       if (n & 1) {
+               l = cacheline_to_bkey(t, inorder, f->m);
+
+               if (++inorder != t->size) {
+                       f = &t->tree[inorder_next(j, t->size)];
+                       r = cacheline_to_bkey(t, inorder, f->m);
+               } else
+                       r = end(t->data);
+       } else {
+               r = cacheline_to_bkey(t, inorder, f->m);
+
+               if (--inorder) {
+                       f = &t->tree[inorder_prev(j, t->size)];
+                       l = cacheline_to_bkey(t, inorder, f->m);
+               } else
+                       l = t->data->start;
+       }
+
+       return (struct bset_search_iter) {l, r};
+}
+
+struct bkey *__bch_bset_search(struct btree *b, struct bset_tree *t,
+                              const struct bkey *search)
+{
+       struct bset_search_iter i;
+
+       /*
+        * First we search for a cacheline, then we do a linear search within
+        * that cacheline.
+        *
+        * To search for the cacheline, there are three different possibilities:
+        *  * The set is too small to have a search tree, so we just do a linear
+        *    search over the whole set.
+        *  * The set is the one we're currently inserting into; keeping a full
+        *    auxiliary search tree up to date would be too expensive, so we
+        *    use a much simpler lookup table to do a binary search -
+        *    bset_search_write_set().
+        *  * Or we use the auxiliary search tree we constructed earlier -
+        *    bset_search_tree()
+        */
+
+       if (unlikely(!t->size)) {
+               i.l = t->data->start;
+               i.r = end(t->data);
+       } else if (bset_written(b, t)) {
+               /*
+                * Each node in the auxiliary search tree covers a certain range
+                * of bits, and keys above and below the set it covers might
+                * differ outside those bits - so we have to special case the
+                * start and end - handle that here:
+                */
+
+               if (unlikely(bkey_cmp(search, &t->end) >= 0))
+                       return end(t->data);
+
+               if (unlikely(bkey_cmp(search, t->data->start) < 0))
+                       return t->data->start;
+
+               i = bset_search_tree(b, t, search);
+       } else
+               i = bset_search_write_set(b, t, search);
+
+#ifdef CONFIG_BCACHE_EDEBUG
+       BUG_ON(bset_written(b, t) &&
+              i.l != t->data->start &&
+              bkey_cmp(tree_to_prev_bkey(t,
+                 inorder_to_tree(bkey_to_cacheline(t, i.l), t)),
+                       search) > 0);
+
+       BUG_ON(i.r != end(t->data) &&
+              bkey_cmp(i.r, search) <= 0);
+#endif
+
+       while (likely(i.l != i.r) &&
+              bkey_cmp(i.l, search) <= 0)
+               i.l = bkey_next(i.l);
+
+       return i.l;
+}
+
+/* Btree iterator */
+
+static inline bool btree_iter_cmp(struct btree_iter_set l,
+                                 struct btree_iter_set r)
+{
+       int64_t c = bkey_cmp(&START_KEY(l.k), &START_KEY(r.k));
+
+       return c ? c > 0 : l.k < r.k;
+}
+
+static inline bool btree_iter_end(struct btree_iter *iter)
+{
+       return !iter->used;
+}
+
+void bch_btree_iter_push(struct btree_iter *iter, struct bkey *k,
+                        struct bkey *end)
+{
+       if (k != end)
+               BUG_ON(!heap_add(iter,
+                                ((struct btree_iter_set) { k, end }),
+                                btree_iter_cmp));
+}
+
+struct bkey *__bch_btree_iter_init(struct btree *b, struct btree_iter *iter,
+                              struct bkey *search, struct bset_tree *start)
+{
+       struct bkey *ret = NULL;
+       iter->size = ARRAY_SIZE(iter->data);
+       iter->used = 0;
+
+       for (; start <= &b->sets[b->nsets]; start++) {
+               ret = bch_bset_search(b, start, search);
+               bch_btree_iter_push(iter, ret, end(start->data));
+       }
+
+       return ret;
+}
+
+struct bkey *bch_btree_iter_next(struct btree_iter *iter)
+{
+       struct btree_iter_set unused;
+       struct bkey *ret = NULL;
+
+       if (!btree_iter_end(iter)) {
+               ret = iter->data->k;
+               iter->data->k = bkey_next(iter->data->k);
+
+               if (iter->data->k > iter->data->end) {
+                       WARN_ONCE(1, "bset was corrupt!\n");
+                       iter->data->k = iter->data->end;
+               }
+
+               if (iter->data->k == iter->data->end)
+                       heap_pop(iter, unused, btree_iter_cmp);
+               else
+                       heap_sift(iter, 0, btree_iter_cmp);
+       }
+
+       return ret;
+}
+
+struct bkey *bch_btree_iter_next_filter(struct btree_iter *iter,
+                                       struct btree *b, ptr_filter_fn fn)
+{
+       struct bkey *ret;
+
+       do {
+               ret = bch_btree_iter_next(iter);
+       } while (ret && fn(b, ret));
+
+       return ret;
+}
+
+struct bkey *bch_next_recurse_key(struct btree *b, struct bkey *search)
+{
+       struct btree_iter iter;
+
+       bch_btree_iter_init(b, &iter, search);
+       return bch_btree_iter_next_filter(&iter, b, bch_ptr_bad);
+}
+
+/* Mergesort */
+
+static void btree_sort_fixup(struct btree_iter *iter)
+{
+       while (iter->used > 1) {
+               struct btree_iter_set *top = iter->data, *i = top + 1;
+               struct bkey *k;
+
+               if (iter->used > 2 &&
+                   btree_iter_cmp(i[0], i[1]))
+                       i++;
+
+               for (k = i->k;
+                    k != i->end && bkey_cmp(top->k, &START_KEY(k)) > 0;
+                    k = bkey_next(k))
+                       if (top->k > i->k)
+                               __bch_cut_front(top->k, k);
+                       else if (KEY_SIZE(k))
+                               bch_cut_back(&START_KEY(k), top->k);
+
+               if (top->k < i->k || k == i->k)
+                       break;
+
+               heap_sift(iter, i - top, btree_iter_cmp);
+       }
+}
+
+static void btree_mergesort(struct btree *b, struct bset *out,
+                           struct btree_iter *iter,
+                           bool fixup, bool remove_stale)
+{
+       struct bkey *k, *last = NULL;
+       bool (*bad)(struct btree *, const struct bkey *) = remove_stale
+               ? bch_ptr_bad
+               : bch_ptr_invalid;
+
+       while (!btree_iter_end(iter)) {
+               if (fixup && !b->level)
+                       btree_sort_fixup(iter);
+
+               k = bch_btree_iter_next(iter);
+               if (bad(b, k))
+                       continue;
+
+               if (!last) {
+                       last = out->start;
+                       bkey_copy(last, k);
+               } else if (b->level ||
+                          !bch_bkey_try_merge(b, last, k)) {
+                       last = bkey_next(last);
+                       bkey_copy(last, k);
+               }
+       }
+
+       out->keys = last ? (uint64_t *) bkey_next(last) - out->d : 0;
+
+       pr_debug("sorted %i keys", out->keys);
+       bch_check_key_order(b, out);
+}
+
+static void __btree_sort(struct btree *b, struct btree_iter *iter,
+                        unsigned start, unsigned order, bool fixup)
+{
+       uint64_t start_time;
+       bool remove_stale = !b->written;
+       struct bset *out = (void *) __get_free_pages(__GFP_NOWARN|GFP_NOIO,
+                                                    order);
+       if (!out) {
+               mutex_lock(&b->c->sort_lock);
+               out = b->c->sort;
+               order = ilog2(bucket_pages(b->c));
+       }
+
+       start_time = local_clock();
+
+       btree_mergesort(b, out, iter, fixup, remove_stale);
+       b->nsets = start;
+
+       if (!fixup && !start && b->written)
+               bch_btree_verify(b, out);
+
+       if (!start && order == b->page_order) {
+               /*
+                * Our temporary buffer is the same size as the btree node's
+                * buffer, we can just swap buffers instead of doing a big
+                * memcpy()
+                */
+
+               out->magic      = bset_magic(b->c);
+               out->seq        = b->sets[0].data->seq;
+               out->version    = b->sets[0].data->version;
+               swap(out, b->sets[0].data);
+
+               if (b->c->sort == b->sets[0].data)
+                       b->c->sort = out;
+       } else {
+               b->sets[start].data->keys = out->keys;
+               memcpy(b->sets[start].data->start, out->start,
+                      (void *) end(out) - (void *) out->start);
+       }
+
+       if (out == b->c->sort)
+               mutex_unlock(&b->c->sort_lock);
+       else
+               free_pages((unsigned long) out, order);
+
+       if (b->written)
+               bset_build_written_tree(b);
+
+       if (!start) {
+               spin_lock(&b->c->sort_time_lock);
+               bch_time_stats_update(&b->c->sort_time, start_time);
+               spin_unlock(&b->c->sort_time_lock);
+       }
+}
+
+void bch_btree_sort_partial(struct btree *b, unsigned start)
+{
+       size_t oldsize = 0, order = b->page_order, keys = 0;
+       struct btree_iter iter;
+       __bch_btree_iter_init(b, &iter, NULL, &b->sets[start]);
+
+       BUG_ON(b->sets[b->nsets].data == write_block(b) &&
+              (b->sets[b->nsets].size || b->nsets));
+
+       if (b->written)
+               oldsize = bch_count_data(b);
+
+       if (start) {
+               unsigned i;
+
+               for (i = start; i <= b->nsets; i++)
+                       keys += b->sets[i].data->keys;
+
+               order = roundup_pow_of_two(__set_bytes(b->sets->data,
+                                                      keys)) / PAGE_SIZE;
+               if (order)
+                       order = ilog2(order);
+       }
+
+       __btree_sort(b, &iter, start, order, false);
+
+       EBUG_ON(b->written && bch_count_data(b) != oldsize);
+}
+
+void bch_btree_sort_and_fix_extents(struct btree *b, struct btree_iter *iter)
+{
+       BUG_ON(!b->written);
+       __btree_sort(b, iter, 0, b->page_order, true);
+}
+
+void bch_btree_sort_into(struct btree *b, struct btree *new)
+{
+       uint64_t start_time = local_clock();
+
+       struct btree_iter iter;
+       bch_btree_iter_init(b, &iter, NULL);
+
+       btree_mergesort(b, new->sets->data, &iter, false, true);
+
+       spin_lock(&b->c->sort_time_lock);
+       bch_time_stats_update(&b->c->sort_time, start_time);
+       spin_unlock(&b->c->sort_time_lock);
+
+       bkey_copy_key(&new->key, &b->key);
+       new->sets->size = 0;
+}
+
+void bch_btree_sort_lazy(struct btree *b)
+{
+       if (b->nsets) {
+               unsigned i, j, keys = 0, total;
+
+               for (i = 0; i <= b->nsets; i++)
+                       keys += b->sets[i].data->keys;
+
+               total = keys;
+
+               for (j = 0; j < b->nsets; j++) {
+                       if (keys * 2 < total ||
+                           keys < 1000) {
+                               bch_btree_sort_partial(b, j);
+                               return;
+                       }
+
+                       keys -= b->sets[j].data->keys;
+               }
+
+               /* Must sort if b->nsets == 3 or we'll overflow */
+               if (b->nsets >= (MAX_BSETS - 1) - b->level) {
+                       bch_btree_sort(b);
+                       return;
+               }
+       }
+
+       bset_build_written_tree(b);
+}
+
+/* Sysfs stuff */
+
+struct bset_stats {
+       size_t nodes;
+       size_t sets_written, sets_unwritten;
+       size_t bytes_written, bytes_unwritten;
+       size_t floats, failed;
+};
+
+static int bch_btree_bset_stats(struct btree *b, struct btree_op *op,
+                           struct bset_stats *stats)
+{
+       struct bkey *k;
+       unsigned i;
+
+       stats->nodes++;
+
+       for (i = 0; i <= b->nsets; i++) {
+               struct bset_tree *t = &b->sets[i];
+               size_t bytes = t->data->keys * sizeof(uint64_t);
+               size_t j;
+
+               if (bset_written(b, t)) {
+                       stats->sets_written++;
+                       stats->bytes_written += bytes;
+
+                       stats->floats += t->size - 1;
+
+                       for (j = 1; j < t->size; j++)
+                               if (t->tree[j].exponent == 127)
+                                       stats->failed++;
+               } else {
+                       stats->sets_unwritten++;
+                       stats->bytes_unwritten += bytes;
+               }
+       }
+
+       if (b->level) {
+               struct btree_iter iter;
+
+               for_each_key_filter(b, k, &iter, bch_ptr_bad) {
+                       int ret = btree(bset_stats, k, b, op, stats);
+                       if (ret)
+                               return ret;
+               }
+       }
+
+       return 0;
+}
+
+int bch_bset_print_stats(struct cache_set *c, char *buf)
+{
+       struct btree_op op;
+       struct bset_stats t;
+       int ret;
+
+       bch_btree_op_init_stack(&op);
+       memset(&t, 0, sizeof(struct bset_stats));
+
+       ret = btree_root(bset_stats, c, &op, &t);
+       if (ret)
+               return ret;
+
+       return snprintf(buf, PAGE_SIZE,
+                       "btree nodes:           %zu\n"
+                       "written sets:          %zu\n"
+                       "unwritten sets:                %zu\n"
+                       "written key bytes:     %zu\n"
+                       "unwritten key bytes:   %zu\n"
+                       "floats:                        %zu\n"
+                       "failed:                        %zu\n",
+                       t.nodes,
+                       t.sets_written, t.sets_unwritten,
+                       t.bytes_written, t.bytes_unwritten,
+                       t.floats, t.failed);
+}
diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h
new file mode 100644 (file)
index 0000000..57a9cff
--- /dev/null
@@ -0,0 +1,379 @@
+#ifndef _BCACHE_BSET_H
+#define _BCACHE_BSET_H
+
+/*
+ * BKEYS:
+ *
+ * A bkey contains a key, a size field, a variable number of pointers, and some
+ * ancillary flag bits.
+ *
+ * We use two different functions for validating bkeys, bch_ptr_invalid and
+ * bch_ptr_bad().
+ *
+ * bch_ptr_invalid() primarily filters out keys and pointers that would be
+ * invalid due to some sort of bug, whereas bch_ptr_bad() filters out keys and
+ * pointer that occur in normal practice but don't point to real data.
+ *
+ * The one exception to the rule that ptr_invalid() filters out invalid keys is
+ * that it also filters out keys of size 0 - these are keys that have been
+ * completely overwritten. It'd be safe to delete these in memory while leaving
+ * them on disk, just unnecessary work - so we filter them out when resorting
+ * instead.
+ *
+ * We can't filter out stale keys when we're resorting, because garbage
+ * collection needs to find them to ensure bucket gens don't wrap around -
+ * unless we're rewriting the btree node those stale keys still exist on disk.
+ *
+ * We also implement functions here for removing some number of sectors from the
+ * front or the back of a bkey - this is mainly used for fixing overlapping
+ * extents, by removing the overlapping sectors from the older key.
+ *
+ * BSETS:
+ *
+ * A bset is an array of bkeys laid out contiguously in memory in sorted order,
+ * along with a header. A btree node is made up of a number of these, written at
+ * different times.
+ *
+ * There could be many of them on disk, but we never allow there to be more than
+ * 4 in memory - we lazily resort as needed.
+ *
+ * We implement code here for creating and maintaining auxiliary search trees
+ * (described below) for searching an individual bset, and on top of that we
+ * implement a btree iterator.
+ *
+ * BTREE ITERATOR:
+ *
+ * Most of the code in bcache doesn't care about an individual bset - it needs
+ * to search entire btree nodes and iterate over them in sorted order.
+ *
+ * The btree iterator code serves both functions; it iterates through the keys
+ * in a btree node in sorted order, starting from either keys after a specific
+ * point (if you pass it a search key) or the start of the btree node.
+ *
+ * AUXILIARY SEARCH TREES:
+ *
+ * Since keys are variable length, we can't use a binary search on a bset - we
+ * wouldn't be able to find the start of the next key. But binary searches are
+ * slow anyways, due to terrible cache behaviour; bcache originally used binary
+ * searches and that code topped out at under 50k lookups/second.
+ *
+ * So we need to construct some sort of lookup table. Since we only insert keys
+ * into the last (unwritten) set, most of the keys within a given btree node are
+ * usually in sets that are mostly constant. We use two different types of
+ * lookup tables to take advantage of this.
+ *
+ * Both lookup tables share in common that they don't index every key in the
+ * set; they index one key every BSET_CACHELINE bytes, and then a linear search
+ * is used for the rest.
+ *
+ * For sets that have been written to disk and are no longer being inserted
+ * into, we construct a binary search tree in an array - traversing a binary
+ * search tree in an array gives excellent locality of reference and is very
+ * fast, since both children of any node are adjacent to each other in memory
+ * (and their grandchildren, and great grandchildren...) - this means
+ * prefetching can be used to great effect.
+ *
+ * It's quite useful performance wise to keep these nodes small - not just
+ * because they're more likely to be in L2, but also because we can prefetch
+ * more nodes on a single cacheline and thus prefetch more iterations in advance
+ * when traversing this tree.
+ *
+ * Nodes in the auxiliary search tree must contain both a key to compare against
+ * (we don't want to fetch the key from the set, that would defeat the purpose),
+ * and a pointer to the key. We use a few tricks to compress both of these.
+ *
+ * To compress the pointer, we take advantage of the fact that one node in the
+ * search tree corresponds to precisely BSET_CACHELINE bytes in the set. We have
+ * a function (to_inorder()) that takes the index of a node in a binary tree and
+ * returns what its index would be in an inorder traversal, so we only have to
+ * store the low bits of the offset.
+ *
+ * The key is 84 bits (KEY_DEV + key->key, the offset on the device). To
+ * compress that,  we take advantage of the fact that when we're traversing the
+ * search tree at every iteration we know that both our search key and the key
+ * we're looking for lie within some range - bounded by our previous
+ * comparisons. (We special case the start of a search so that this is true even
+ * at the root of the tree).
+ *
+ * So if we know the key we're looking for is between a and b, and a and b
+ * don't differ above bit 50, we don't need to check anything above bit 50.
+ *
+ * We don't usually need the rest of the bits, either; we only need enough bits
+ * to partition the key range we're currently checking.  Consider key n - the
+ * key our auxiliary search tree node corresponds to, and key p, the key
+ * immediately preceding n.  The lowest bit we need to store in the auxiliary
+ * search tree is the highest bit that differs between n and p.
+ *
+ * Note that this could be bit 0 - we might sometimes need all 80 bits to do the
+ * comparison. But we'd really like our nodes in the auxiliary search tree to be
+ * of fixed size.
+ *
+ * The solution is to make them fixed size, and when we're constructing a node
+ * check if p and n differ in the bits we need them to. If they don't, we
+ * flag that node, and when doing lookups we fall back to comparing against the
+ * real key. As long as this doesn't happen too often (and it seems to reliably
+ * happen a bit less than 1% of the time), we win - even on failures, that key
+ * is then more likely to be in cache than if we were doing binary searches all
+ * the way, since we're touching so much less memory.
+ *
+ * The keys in the auxiliary search tree are stored in (software) floating
+ * point, with an exponent and a mantissa. The exponent needs to be big enough
+ * to address all the bits in the original key, but the number of bits in the
+ * mantissa is somewhat arbitrary; more bits just gets us fewer failures.
+ *
+ * We need 7 bits for the exponent and 3 bits for the key's offset (since keys
+ * are 8 byte aligned); using 22 bits for the mantissa means a node is 4 bytes.
+ * We need one node per 128 bytes in the btree node, which means the auxiliary
+ * search trees take up 3% as much memory as the btree itself.
+ *
+ * Constructing these auxiliary search trees is moderately expensive, and we
+ * don't want to be constantly rebuilding the search tree for the last set
+ * whenever we insert another key into it. For the unwritten set, we use a much
+ * simpler lookup table - it's just a flat array, so index i in the lookup table
+ * corresponds to the i-th range of BSET_CACHELINE bytes in the set. Indexing
+ * within each byte range works the same as with the auxiliary search trees.
+ *
+ * These are much easier to keep up to date when we insert a key - we do it
+ * somewhat lazily; when we shift a key up we usually just increment the pointer
+ * to it, only when it would overflow do we go to the trouble of finding the
+ * first key in that range of bytes again.
+ */
+
+/* Btree key comparison/iteration */
+
+struct btree_iter {
+       size_t size, used;
+       struct btree_iter_set {
+               struct bkey *k, *end;
+       } data[MAX_BSETS];
+};
+
+struct bset_tree {
+       /*
+        * We construct a binary tree in an array as if the array
+        * started at 1, so that things line up on the same cachelines
+        * better: see comments in bset.c at cacheline_to_bkey() for
+        * details
+        */
+
+       /* size of the binary tree and prev array */
+       unsigned        size;
+
+       /* function of size - precalculated for to_inorder() */
+       unsigned        extra;
+
+       /* copy of the last key in the set */
+       struct bkey     end;
+       struct bkey_float *tree;
+
+       /*
+        * The nodes in the bset tree point to specific keys - this
+        * array holds the sizes of the previous key.
+        *
+        * Conceptually it's a member of struct bkey_float, but we want
+        * to keep bkey_float to 4 bytes and prev isn't used in the fast
+        * path.
+        */
+       uint8_t         *prev;
+
+       /* The actual btree node, with pointers to each sorted set */
+       struct bset     *data;
+};
+
+static __always_inline int64_t bkey_cmp(const struct bkey *l,
+                                       const struct bkey *r)
+{
+       return unlikely(KEY_INODE(l) != KEY_INODE(r))
+               ? (int64_t) KEY_INODE(l) - (int64_t) KEY_INODE(r)
+               : (int64_t) KEY_OFFSET(l) - (int64_t) KEY_OFFSET(r);
+}
+
+static inline size_t bkey_u64s(const struct bkey *k)
+{
+       BUG_ON(KEY_CSUM(k) > 1);
+       return 2 + KEY_PTRS(k) + (KEY_CSUM(k) ? 1 : 0);
+}
+
+static inline size_t bkey_bytes(const struct bkey *k)
+{
+       return bkey_u64s(k) * sizeof(uint64_t);
+}
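+
+/*
+ * For example, a key with two pointers and no inline checksum is
+ * 2 + 2 + 0 = 4 u64s, i.e. bkey_bytes() == 32.
+ */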
+
+static inline void bkey_copy(struct bkey *dest, const struct bkey *src)
+{
+       memcpy(dest, src, bkey_bytes(src));
+}
+
+static inline void bkey_copy_key(struct bkey *dest, const struct bkey *src)
+{
+       if (!src)
+               src = &KEY(0, 0, 0);
+
+       SET_KEY_INODE(dest, KEY_INODE(src));
+       SET_KEY_OFFSET(dest, KEY_OFFSET(src));
+}
+
+static inline struct bkey *bkey_next(const struct bkey *k)
+{
+       uint64_t *d = (void *) k;
+       return (struct bkey *) (d + bkey_u64s(k));
+}
+
+/* Keylists */
+
+struct keylist {
+       struct bkey             *top;
+       union {
+               uint64_t                *list;
+               struct bkey             *bottom;
+       };
+
+       /* Enough room for btree_split's keys without realloc */
+#define KEYLIST_INLINE         16
+       uint64_t                d[KEYLIST_INLINE];
+};
+
+static inline void bch_keylist_init(struct keylist *l)
+{
+       l->top = (void *) (l->list = l->d);
+}
+
+static inline void bch_keylist_push(struct keylist *l)
+{
+       l->top = bkey_next(l->top);
+}
+
+static inline void bch_keylist_add(struct keylist *l, struct bkey *k)
+{
+       bkey_copy(l->top, k);
+       bch_keylist_push(l);
+}
+
+static inline bool bch_keylist_empty(struct keylist *l)
+{
+       return l->top == (void *) l->list;
+}
+
+static inline void bch_keylist_free(struct keylist *l)
+{
+       if (l->list != l->d)
+               kfree(l->list);
+}
+
+void bch_keylist_copy(struct keylist *, struct keylist *);
+struct bkey *bch_keylist_pop(struct keylist *);
+int bch_keylist_realloc(struct keylist *, int, struct cache_set *);
+
+void bch_bkey_copy_single_ptr(struct bkey *, const struct bkey *,
+                             unsigned);
+bool __bch_cut_front(const struct bkey *, struct bkey *);
+bool __bch_cut_back(const struct bkey *, struct bkey *);
+
+static inline bool bch_cut_front(const struct bkey *where, struct bkey *k)
+{
+       BUG_ON(bkey_cmp(where, k) > 0);
+       return __bch_cut_front(where, k);
+}
+
+static inline bool bch_cut_back(const struct bkey *where, struct bkey *k)
+{
+       BUG_ON(bkey_cmp(where, &START_KEY(k)) < 0);
+       return __bch_cut_back(where, k);
+}
+
+const char *bch_ptr_status(struct cache_set *, const struct bkey *);
+bool __bch_ptr_invalid(struct cache_set *, int level, const struct bkey *);
+bool bch_ptr_bad(struct btree *, const struct bkey *);
+
+static inline uint8_t gen_after(uint8_t a, uint8_t b)
+{
+       uint8_t r = a - b;
+       return r > 128U ? 0 : r;
+}
+
+static inline uint8_t ptr_stale(struct cache_set *c, const struct bkey *k,
+                               unsigned i)
+{
+       return gen_after(PTR_BUCKET(c, k, i)->gen, PTR_GEN(k, i));
+}
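+
+/*
+ * For example, a bucket whose gen has been bumped 3 times since the pointer
+ * was created gives ptr_stale() == 3; if the pointer's gen is somehow ahead
+ * of the bucket's (difference wraps past 128), gen_after() clamps to 0 and
+ * the pointer is treated as fresh.
+ */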
+
+static inline bool ptr_available(struct cache_set *c, const struct bkey *k,
+                                unsigned i)
+{
+       return (PTR_DEV(k, i) < MAX_CACHES_PER_SET) && PTR_CACHE(c, k, i);
+}
+
+
+typedef bool (*ptr_filter_fn)(struct btree *, const struct bkey *);
+
+struct bkey *bch_next_recurse_key(struct btree *, struct bkey *);
+struct bkey *bch_btree_iter_next(struct btree_iter *);
+struct bkey *bch_btree_iter_next_filter(struct btree_iter *,
+                                       struct btree *, ptr_filter_fn);
+
+void bch_btree_iter_push(struct btree_iter *, struct bkey *, struct bkey *);
+struct bkey *__bch_btree_iter_init(struct btree *, struct btree_iter *,
+                                  struct bkey *, struct bset_tree *);
+
+/* 32 bits total: */
+#define BKEY_MID_BITS          3
+#define BKEY_EXPONENT_BITS     7
+#define BKEY_MANTISSA_BITS     22
+#define BKEY_MANTISSA_MASK     ((1 << BKEY_MANTISSA_BITS) - 1)
+
+struct bkey_float {
+       unsigned        exponent:BKEY_EXPONENT_BITS;
+       unsigned        m:BKEY_MID_BITS;
+       unsigned        mantissa:BKEY_MANTISSA_BITS;
+} __packed;
+
+/*
+ * BSET_CACHELINE was originally intended to match the hardware cacheline size -
+ * it used to be 64, but I realized the lookup code would touch slightly less
+ * memory if it was 128.
+ *
+ * It defines the number of bytes (in struct bset) per struct bkey_float in
+ * the auxiliary search tree - when we're done searching the bset_float tree we
+ * have this many bytes left that we do a linear search over.
+ *
+ * Since (after level 5) every level of the bset_tree is on a new cacheline,
+ * we're touching one fewer cacheline in the bset tree in exchange for one more
+ * cacheline in the linear search - but the linear search might stop before it
+ * gets to the second cacheline.
+ */
+
+#define BSET_CACHELINE         128
+#define bset_tree_space(b)     (btree_data_space(b) / BSET_CACHELINE)
+
+#define bset_tree_bytes(b)     (bset_tree_space(b) * sizeof(struct bkey_float))
+#define bset_prev_bytes(b)     (bset_tree_space(b) * sizeof(uint8_t))
+
+void bch_bset_init_next(struct btree *);
+
+void bch_bset_fix_invalidated_key(struct btree *, struct bkey *);
+void bch_bset_fix_lookup_table(struct btree *, struct bkey *);
+
+struct bkey *__bch_bset_search(struct btree *, struct bset_tree *,
+                          const struct bkey *);
+
+static inline struct bkey *bch_bset_search(struct btree *b, struct bset_tree *t,
+                                          const struct bkey *search)
+{
+       return search ? __bch_bset_search(b, t, search) : t->data->start;
+}
+
+bool bch_bkey_try_merge(struct btree *, struct bkey *, struct bkey *);
+void bch_btree_sort_lazy(struct btree *);
+void bch_btree_sort_into(struct btree *, struct btree *);
+void bch_btree_sort_and_fix_extents(struct btree *, struct btree_iter *);
+void bch_btree_sort_partial(struct btree *, unsigned);
+
+static inline void bch_btree_sort(struct btree *b)
+{
+       bch_btree_sort_partial(b, 0);
+}
+
+int bch_bset_print_stats(struct cache_set *, char *);
+
+#endif
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
new file mode 100644 (file)
index 0000000..7a5658f
--- /dev/null
@@ -0,0 +1,2503 @@
+/*
+ * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com>
+ *
+ * Uses a block device as cache for other block devices; optimized for SSDs.
+ * All allocation is done in buckets, which should match the erase block size
+ * of the device.
+ *
+ * Buckets containing cached data are kept on a heap sorted by priority;
+ * bucket priority is increased on cache hit, and periodically all the buckets
+ * on the heap have their priority scaled down. This currently is just used as
+ * an LRU but in the future should allow for more intelligent heuristics.
+ *
+ * Buckets have an 8 bit counter; freeing is accomplished by incrementing the
+ * counter. Garbage collection is used to remove stale pointers.
+ *
+ * Indexing is done via a btree; nodes are not necessarily fully sorted, rather
+ * as keys are inserted we only sort the pages that have not yet been written.
+ * When garbage collection is run, we resort the entire node.
+ *
+ * All configuration is done via sysfs; see Documentation/bcache.txt.
+ */
+
+#include "bcache.h"
+#include "btree.h"
+#include "debug.h"
+#include "request.h"
+
+#include <linux/slab.h>
+#include <linux/bitops.h>
+#include <linux/hash.h>
+#include <linux/prefetch.h>
+#include <linux/random.h>
+#include <linux/rcupdate.h>
+#include <trace/events/bcache.h>
+
+/*
+ * Todo:
+ * register_bcache: Return errors out to userspace correctly
+ *
+ * Writeback: don't undirty key until after a cache flush
+ *
+ * Create an iterator for key pointers
+ *
+ * On btree write error, mark bucket such that it won't be freed from the cache
+ *
+ * Journalling:
+ *   Check for bad keys in replay
+ *   Propagate barriers
+ *   Refcount journal entries in journal_replay
+ *
+ * Garbage collection:
+ *   Finish incremental gc
+ *   Gc should free old UUIDs, data for invalid UUIDs
+ *
+ * Provide a way to list backing device UUIDs we have data cached for, and
+ * probably how long it's been since we've seen them, and a way to invalidate
+ * dirty data for devices that will never be attached again
+ *
+ * Keep 1 min/5 min/15 min statistics of how busy a block device has been, so
+ * that based on that and how much dirty data we have we can keep writeback
+ * from being starved
+ *
+ * Add a tracepoint or somesuch to watch for writeback starvation
+ *
+ * When btree depth > 1 and splitting an interior node, we have to make sure
+ * alloc_bucket() cannot fail. This should be true but is not completely
+ * obvious.
+ *
+ * Make sure all allocations get charged to the root cgroup
+ *
+ * Plugging?
+ *
+ * If a data write is smaller than the SSD's hard sector size, round up the
+ * offset in the open bucket to the next whole sector
+ *
+ * Also lookup by cgroup in get_open_bucket()
+ *
+ * Superblock needs to be fleshed out for multiple cache devices
+ *
+ * Add a sysfs tunable for the number of writeback IOs in flight
+ *
+ * Add a sysfs tunable for the number of open data buckets
+ *
+ * IO tracking: Can we track when one process is doing io on behalf of another?
+ * IO tracking: Don't use just an average, weigh more recent stuff higher
+ *
+ * Test module load/unload
+ */
+
+static const char * const op_types[] = {
+       "insert", "replace"
+};
+
+static const char *op_type(struct btree_op *op)
+{
+       return op_types[op->type];
+}
+
+#define MAX_NEED_GC            64
+#define MAX_SAVE_PRIO          72
+
+#define PTR_DIRTY_BIT          (((uint64_t) 1 << 36))
+
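+/*
+ * Nodes are keyed in the in-memory btree cache by their first pointer's
+ * bucket and generation (see mca_hash() and mca_find() below).
+ */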
+#define PTR_HASH(c, k)                                                 \
+       (((k)->ptr[0] >> c->bucket_bits) | PTR_GEN(k, 0))
+
+struct workqueue_struct *bch_gc_wq;
+static struct workqueue_struct *btree_io_wq;
+
+void bch_btree_op_init_stack(struct btree_op *op)
+{
+       memset(op, 0, sizeof(struct btree_op));
+       closure_init_stack(&op->cl);
+       op->lock = -1;
+       bch_keylist_init(&op->keys);
+}
+
+/* Btree key manipulation */
+
+static void bkey_put(struct cache_set *c, struct bkey *k, int level)
+{
+       if ((level && KEY_OFFSET(k)) || !level)
+               __bkey_put(c, k);
+}
+
+/* Btree IO */
+
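+/*
+ * Checksum the bset contents, skipping the 64-bit csum field at the start of
+ * the bset itself; the crc is seeded with the node's first pointer, so
+ * identical data written to a different bucket yields a different checksum.
+ */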
+static uint64_t btree_csum_set(struct btree *b, struct bset *i)
+{
+       uint64_t crc = b->key.ptr[0];
+       void *data = (void *) i + 8, *end = end(i);
+
+       crc = bch_crc64_update(crc, data, end - data);
+       return crc ^ 0xffffffffffffffffULL;
+}
+
+static void btree_bio_endio(struct bio *bio, int error)
+{
+       struct closure *cl = bio->bi_private;
+       struct btree *b = container_of(cl, struct btree, io.cl);
+
+       if (error)
+               set_btree_node_io_error(b);
+
+       bch_bbio_count_io_errors(b->c, bio, error, (bio->bi_rw & WRITE)
+                                ? "writing btree" : "reading btree");
+       closure_put(cl);
+}
+
+static void btree_bio_init(struct btree *b)
+{
+       BUG_ON(b->bio);
+       b->bio = bch_bbio_alloc(b->c);
+
+       b->bio->bi_end_io       = btree_bio_endio;
+       b->bio->bi_private      = &b->io.cl;
+}
+
+void bch_btree_read_done(struct closure *cl)
+{
+       struct btree *b = container_of(cl, struct btree, io.cl);
+       struct bset *i = b->sets[0].data;
+       struct btree_iter *iter = b->c->fill_iter;
+       const char *err = "bad btree header";
+       BUG_ON(b->nsets || b->written);
+
+       bch_bbio_free(b->bio, b->c);
+       b->bio = NULL;
+
+       mutex_lock(&b->c->fill_lock);
+       iter->used = 0;
+
+       if (btree_node_io_error(b) ||
+           !i->seq)
+               goto err;
+
+       for (;
+            b->written < btree_blocks(b) && i->seq == b->sets[0].data->seq;
+            i = write_block(b)) {
+               err = "unsupported bset version";
+               if (i->version > BCACHE_BSET_VERSION)
+                       goto err;
+
+               err = "bad btree header";
+               if (b->written + set_blocks(i, b->c) > btree_blocks(b))
+                       goto err;
+
+               err = "bad magic";
+               if (i->magic != bset_magic(b->c))
+                       goto err;
+
+               err = "bad checksum";
+               switch (i->version) {
+               case 0:
+                       if (i->csum != csum_set(i))
+                               goto err;
+                       break;
+               case BCACHE_BSET_VERSION:
+                       if (i->csum != btree_csum_set(b, i))
+                               goto err;
+                       break;
+               }
+
+               err = "empty set";
+               if (i != b->sets[0].data && !i->keys)
+                       goto err;
+
+               bch_btree_iter_push(iter, i->start, end(i));
+
+               b->written += set_blocks(i, b->c);
+       }
+
+       err = "corrupted btree";
+       for (i = write_block(b);
+            index(i, b) < btree_blocks(b);
+            i = ((void *) i) + block_bytes(b->c))
+               if (i->seq == b->sets[0].data->seq)
+                       goto err;
+
+       bch_btree_sort_and_fix_extents(b, iter);
+
+       i = b->sets[0].data;
+       err = "short btree key";
+       if (b->sets[0].size &&
+           bkey_cmp(&b->key, &b->sets[0].end) < 0)
+               goto err;
+
+       if (b->written < btree_blocks(b))
+               bch_bset_init_next(b);
+out:
+
+       mutex_unlock(&b->c->fill_lock);
+
+       spin_lock(&b->c->btree_read_time_lock);
+       bch_time_stats_update(&b->c->btree_read_time, b->io_start_time);
+       spin_unlock(&b->c->btree_read_time_lock);
+
+       smp_wmb(); /* read_done is our write lock */
+       set_btree_node_read_done(b);
+
+       closure_return(cl);
+err:
+       set_btree_node_io_error(b);
+       bch_cache_set_error(b->c, "%s at bucket %zu, block %zu, %u keys",
+                           err, PTR_BUCKET_NR(b->c, &b->key, 0),
+                           index(i, b), i->keys);
+       goto out;
+}
+
+void bch_btree_read(struct btree *b)
+{
+       BUG_ON(b->nsets || b->written);
+
+       if (!closure_trylock(&b->io.cl, &b->c->cl))
+               BUG();
+
+       b->io_start_time = local_clock();
+
+       btree_bio_init(b);
+       b->bio->bi_rw   = REQ_META|READ_SYNC;
+       b->bio->bi_size = KEY_SIZE(&b->key) << 9;
+
+       bch_bio_map(b->bio, b->sets[0].data);
+
+       pr_debug("%s", pbtree(b));
+       trace_bcache_btree_read(b->bio);
+       bch_submit_bbio(b->bio, b->c, &b->key, 0);
+
+       continue_at(&b->io.cl, bch_btree_read_done, system_wq);
+}
+
+static void btree_complete_write(struct btree *b, struct btree_write *w)
+{
+       if (w->prio_blocked &&
+           !atomic_sub_return(w->prio_blocked, &b->c->prio_blocked))
+               wake_up(&b->c->alloc_wait);
+
+       if (w->journal) {
+               atomic_dec_bug(w->journal);
+               __closure_wake_up(&b->c->journal.wait);
+       }
+
+       if (w->owner)
+               closure_put(w->owner);
+
+       w->prio_blocked = 0;
+       w->journal      = NULL;
+       w->owner        = NULL;
+}
+
+static void __btree_write_done(struct closure *cl)
+{
+       struct btree *b = container_of(cl, struct btree, io.cl);
+       struct btree_write *w = btree_prev_write(b);
+
+       bch_bbio_free(b->bio, b->c);
+       b->bio = NULL;
+       btree_complete_write(b, w);
+
+       if (btree_node_dirty(b))
+               queue_delayed_work(btree_io_wq, &b->work,
+                                  msecs_to_jiffies(30000));
+
+       closure_return(cl);
+}
+
+static void btree_write_done(struct closure *cl)
+{
+       struct btree *b = container_of(cl, struct btree, io.cl);
+       struct bio_vec *bv;
+       int n;
+
+       __bio_for_each_segment(bv, b->bio, n, 0)
+               __free_page(bv->bv_page);
+
+       __btree_write_done(cl);
+}
+
+static void do_btree_write(struct btree *b)
+{
+       struct closure *cl = &b->io.cl;
+       struct bset *i = b->sets[b->nsets].data;
+       BKEY_PADDED(key) k;
+
+       i->version      = BCACHE_BSET_VERSION;
+       i->csum         = btree_csum_set(b, i);
+
+       btree_bio_init(b);
+       b->bio->bi_rw   = REQ_META|WRITE_SYNC;
+       b->bio->bi_size = set_blocks(i, b->c) * block_bytes(b->c);
+       bch_bio_map(b->bio, i);
+
+       bkey_copy(&k.key, &b->key);
+       SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) + bset_offset(b, i));
+
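+       /*
+        * Prefer writing from a private copy of the bset: if page allocation
+        * succeeds, memcpy the keys into the bounce pages and let the write
+        * complete asynchronously; otherwise fall back to writing the node's
+        * own pages in place and wait for the IO synchronously.
+        */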
+       if (!bch_bio_alloc_pages(b->bio, GFP_NOIO)) {
+               int j;
+               struct bio_vec *bv;
+               void *base = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1));
+
+               bio_for_each_segment(bv, b->bio, j)
+                       memcpy(page_address(bv->bv_page),
+                              base + j * PAGE_SIZE, PAGE_SIZE);
+
+               trace_bcache_btree_write(b->bio);
+               bch_submit_bbio(b->bio, b->c, &k.key, 0);
+
+               continue_at(cl, btree_write_done, NULL);
+       } else {
+               b->bio->bi_vcnt = 0;
+               bch_bio_map(b->bio, i);
+
+               trace_bcache_btree_write(b->bio);
+               bch_submit_bbio(b->bio, b->c, &k.key, 0);
+
+               closure_sync(cl);
+               __btree_write_done(cl);
+       }
+}
+
+static void __btree_write(struct btree *b)
+{
+       struct bset *i = b->sets[b->nsets].data;
+
+       BUG_ON(current->bio_list);
+
+       closure_lock(&b->io, &b->c->cl);
+       cancel_delayed_work(&b->work);
+
+       clear_bit(BTREE_NODE_dirty,      &b->flags);
+       change_bit(BTREE_NODE_write_idx, &b->flags);
+
+       bch_check_key_order(b, i);
+       BUG_ON(b->written && !i->keys);
+
+       do_btree_write(b);
+
+       pr_debug("%s block %i keys %i", pbtree(b), b->written, i->keys);
+
+       b->written += set_blocks(i, b->c);
+       atomic_long_add(set_blocks(i, b->c) * b->c->sb.block_size,
+                       &PTR_CACHE(b->c, &b->key, 0)->btree_sectors_written);
+
+       bch_btree_sort_lazy(b);
+
+       if (b->written < btree_blocks(b))
+               bch_bset_init_next(b);
+}
+
+static void btree_write_work(struct work_struct *w)
+{
+       struct btree *b = container_of(to_delayed_work(w), struct btree, work);
+
+       down_write(&b->lock);
+
+       if (btree_node_dirty(b))
+               __btree_write(b);
+       up_write(&b->lock);
+}
+
+void bch_btree_write(struct btree *b, bool now, struct btree_op *op)
+{
+       struct bset *i = b->sets[b->nsets].data;
+       struct btree_write *w = btree_current_write(b);
+
+       BUG_ON(b->written &&
+              (b->written >= btree_blocks(b) ||
+               i->seq != b->sets[0].data->seq ||
+               !i->keys));
+
+       if (!btree_node_dirty(b)) {
+               set_btree_node_dirty(b);
+               queue_delayed_work(btree_io_wq, &b->work,
+                                  msecs_to_jiffies(30000));
+       }
+
+       w->prio_blocked += b->prio_blocked;
+       b->prio_blocked = 0;
+
+       if (op && op->journal && !b->level) {
+               if (w->journal &&
+                   journal_pin_cmp(b->c, w, op)) {
+                       atomic_dec_bug(w->journal);
+                       w->journal = NULL;
+               }
+
+               if (!w->journal) {
+                       w->journal = op->journal;
+                       atomic_inc(w->journal);
+               }
+       }
+
+       if (current->bio_list)
+               return;
+
+       /* Force write if set is too big */
+       if (now ||
+           b->level ||
+           set_bytes(i) > PAGE_SIZE - 48) {
+               if (op && now) {
+                       /* Must wait on multiple writes */
+                       BUG_ON(w->owner);
+                       w->owner = &op->cl;
+                       closure_get(&op->cl);
+               }
+
+               __btree_write(b);
+       }
+       BUG_ON(!b->written);
+}
+
+/*
+ * Btree in memory cache - allocation/freeing
+ * mca -> memory cache
+ */
+
+static void mca_reinit(struct btree *b)
+{
+       unsigned i;
+
+       b->flags        = 0;
+       b->written      = 0;
+       b->nsets        = 0;
+
+       for (i = 0; i < MAX_BSETS; i++)
+               b->sets[i].size = 0;
+       /*
+        * Second loop starts at 1 because b->sets[0].data is the memory we
+        * allocated
+        */
+       for (i = 1; i < MAX_BSETS; i++)
+               b->sets[i].data = NULL;
+}
+
+#define mca_reserve(c) (((c->root && c->root->level)           \
+                         ? c->root->level : 1) * 8 + 16)
+#define mca_can_free(c)                                                \
+       max_t(int, 0, c->bucket_cache_used - mca_reserve(c))
+
+static void mca_data_free(struct btree *b)
+{
+       struct bset_tree *t = b->sets;
+       BUG_ON(!closure_is_unlocked(&b->io.cl));
+
+       if (bset_prev_bytes(b) < PAGE_SIZE)
+               kfree(t->prev);
+       else
+               free_pages((unsigned long) t->prev,
+                          get_order(bset_prev_bytes(b)));
+
+       if (bset_tree_bytes(b) < PAGE_SIZE)
+               kfree(t->tree);
+       else
+               free_pages((unsigned long) t->tree,
+                          get_order(bset_tree_bytes(b)));
+
+       free_pages((unsigned long) t->data, b->page_order);
+
+       t->prev = NULL;
+       t->tree = NULL;
+       t->data = NULL;
+       list_move(&b->list, &b->c->btree_cache_freed);
+       b->c->bucket_cache_used--;
+}
+
+static void mca_bucket_free(struct btree *b)
+{
+       BUG_ON(btree_node_dirty(b));
+
+       b->key.ptr[0] = 0;
+       hlist_del_init_rcu(&b->hash);
+       list_move(&b->list, &b->c->btree_cache_freeable);
+}
+
+static unsigned btree_order(struct bkey *k)
+{
+       return ilog2(KEY_SIZE(k) / PAGE_SECTORS ?: 1);
+}
+
+static void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp)
+{
+       struct bset_tree *t = b->sets;
+       BUG_ON(t->data);
+
+       b->page_order = max_t(unsigned,
+                             ilog2(b->c->btree_pages),
+                             btree_order(k));
+
+       t->data = (void *) __get_free_pages(gfp, b->page_order);
+       if (!t->data)
+               goto err;
+
+       t->tree = bset_tree_bytes(b) < PAGE_SIZE
+               ? kmalloc(bset_tree_bytes(b), gfp)
+               : (void *) __get_free_pages(gfp, get_order(bset_tree_bytes(b)));
+       if (!t->tree)
+               goto err;
+
+       t->prev = bset_prev_bytes(b) < PAGE_SIZE
+               ? kmalloc(bset_prev_bytes(b), gfp)
+               : (void *) __get_free_pages(gfp, get_order(bset_prev_bytes(b)));
+       if (!t->prev)
+               goto err;
+
+       list_move(&b->list, &b->c->btree_cache);
+       b->c->bucket_cache_used++;
+       return;
+err:
+       mca_data_free(b);
+}
+
+static struct btree *mca_bucket_alloc(struct cache_set *c,
+                                     struct bkey *k, gfp_t gfp)
+{
+       struct btree *b = kzalloc(sizeof(struct btree), gfp);
+       if (!b)
+               return NULL;
+
+       init_rwsem(&b->lock);
+       lockdep_set_novalidate_class(&b->lock);
+       INIT_LIST_HEAD(&b->list);
+       INIT_DELAYED_WORK(&b->work, btree_write_work);
+       b->c = c;
+       closure_init_unlocked(&b->io);
+
+       mca_data_alloc(b, k, gfp);
+       return b;
+}
+
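+/*
+ * Try to reclaim a cached btree node: take its lock without blocking, flush
+ * it if it's dirty (when the caller passes a closure to wait on), and only
+ * succeed once the node is clean, idle and its allocation order is at least
+ * min_order.
+ */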
+static int mca_reap(struct btree *b, struct closure *cl, unsigned min_order)
+{
+       lockdep_assert_held(&b->c->bucket_lock);
+
+       if (!down_write_trylock(&b->lock))
+               return -ENOMEM;
+
+       if (b->page_order < min_order) {
+               rw_unlock(true, b);
+               return -ENOMEM;
+       }
+
+       BUG_ON(btree_node_dirty(b) && !b->sets[0].data);
+
+       if (cl && btree_node_dirty(b))
+               bch_btree_write(b, true, NULL);
+
+       if (cl)
+               closure_wait_event_async(&b->io.wait, cl,
+                        atomic_read(&b->io.cl.remaining) == -1);
+
+       if (btree_node_dirty(b) ||
+           !closure_is_unlocked(&b->io.cl) ||
+           work_pending(&b->work.work)) {
+               rw_unlock(true, b);
+               return -EAGAIN;
+       }
+
+       return 0;
+}
+
+static int bch_mca_shrink(struct shrinker *shrink, struct shrink_control *sc)
+{
+       struct cache_set *c = container_of(shrink, struct cache_set, shrink);
+       struct btree *b, *t;
+       unsigned long i, nr = sc->nr_to_scan;
+
+       if (c->shrinker_disabled)
+               return 0;
+
+       if (c->try_harder)
+               return 0;
+
+       /*
+        * If nr == 0, we're supposed to return the number of items we have
+        * cached. Not allowed to return -1.
+        */
+       if (!nr)
+               return mca_can_free(c) * c->btree_pages;
+
+       /* Return -1 if we can't do anything right now */
+       if (sc->gfp_mask & __GFP_WAIT)
+               mutex_lock(&c->bucket_lock);
+       else if (!mutex_trylock(&c->bucket_lock))
+               return -1;
+
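+       /*
+        * We reported our size to the shrinker in pages (see the nr == 0 case
+        * above), but we free whole btree nodes; convert the scan request to
+        * a node count and cap it at what can actually be freed.
+        */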
+       nr /= c->btree_pages;
+       nr = min_t(unsigned long, nr, mca_can_free(c));
+
+       i = 0;
+       list_for_each_entry_safe(b, t, &c->btree_cache_freeable, list) {
+               if (!nr)
+                       break;
+
+               if (++i > 3 &&
+                   !mca_reap(b, NULL, 0)) {
+                       mca_data_free(b);
+                       rw_unlock(true, b);
+                       --nr;
+               }
+       }
+
+       /*
+        * Can happen right when we first start up, before we've read in any
+        * btree nodes
+        */
+       if (list_empty(&c->btree_cache))
+               goto out;
+
+       for (i = 0; nr && i < c->bucket_cache_used; i++) {
+               b = list_first_entry(&c->btree_cache, struct btree, list);
+               list_rotate_left(&c->btree_cache);
+
+               if (!b->accessed &&
+                   !mca_reap(b, NULL, 0)) {
+                       mca_bucket_free(b);
+                       mca_data_free(b);
+                       rw_unlock(true, b);
+                       --nr;
+               } else
+                       b->accessed = 0;
+       }
+out:
+       nr = mca_can_free(c) * c->btree_pages;
+       mutex_unlock(&c->bucket_lock);
+       return nr;
+}
+
+void bch_btree_cache_free(struct cache_set *c)
+{
+       struct btree *b;
+       struct closure cl;
+       closure_init_stack(&cl);
+
+       if (c->shrink.list.next)
+               unregister_shrinker(&c->shrink);
+
+       mutex_lock(&c->bucket_lock);
+
+#ifdef CONFIG_BCACHE_DEBUG
+       if (c->verify_data)
+               list_move(&c->verify_data->list, &c->btree_cache);
+#endif
+
+       list_splice(&c->btree_cache_freeable,
+                   &c->btree_cache);
+
+       while (!list_empty(&c->btree_cache)) {
+               b = list_first_entry(&c->btree_cache, struct btree, list);
+
+               if (btree_node_dirty(b))
+                       btree_complete_write(b, btree_current_write(b));
+               clear_bit(BTREE_NODE_dirty, &b->flags);
+
+               mca_data_free(b);
+       }
+
+       while (!list_empty(&c->btree_cache_freed)) {
+               b = list_first_entry(&c->btree_cache_freed,
+                                    struct btree, list);
+               list_del(&b->list);
+               cancel_delayed_work_sync(&b->work);
+               kfree(b);
+       }
+
+       mutex_unlock(&c->bucket_lock);
+}
+
+int bch_btree_cache_alloc(struct cache_set *c)
+{
+       unsigned i;
+
+       /* XXX: doesn't check for errors */
+
+       closure_init_unlocked(&c->gc);
+
+       for (i = 0; i < mca_reserve(c); i++)
+               mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL);
+
+       list_splice_init(&c->btree_cache,
+                        &c->btree_cache_freeable);
+
+#ifdef CONFIG_BCACHE_DEBUG
+       mutex_init(&c->verify_lock);
+
+       c->verify_data = mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL);
+
+       if (c->verify_data &&
+           c->verify_data->sets[0].data)
+               list_del_init(&c->verify_data->list);
+       else
+               c->verify_data = NULL;
+#endif
+
+       c->shrink.shrink = bch_mca_shrink;
+       c->shrink.seeks = 4;
+       c->shrink.batch = c->btree_pages * 2;
+       register_shrinker(&c->shrink);
+
+       return 0;
+}
+
+/* Btree in memory cache - hash table */
+
+static struct hlist_head *mca_hash(struct cache_set *c, struct bkey *k)
+{
+       return &c->bucket_hash[hash_32(PTR_HASH(c, k), BUCKET_HASH_BITS)];
+}
+
+static struct btree *mca_find(struct cache_set *c, struct bkey *k)
+{
+       struct btree *b;
+
+       rcu_read_lock();
+       hlist_for_each_entry_rcu(b, mca_hash(c, k), hash)
+               if (PTR_HASH(c, &b->key) == PTR_HASH(c, k))
+                       goto out;
+       b = NULL;
+out:
+       rcu_read_unlock();
+       return b;
+}
+
+static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k,
+                                    int level, struct closure *cl)
+{
+       int ret = -ENOMEM;
+       struct btree *i;
+
+       if (!cl)
+               return ERR_PTR(-ENOMEM);
+
+       /*
+        * Trying to free up some memory - i.e. reuse some btree nodes - may
+        * require initiating IO to flush the dirty part of the node. If we're
+        * running under generic_make_request(), that IO will never finish and
+        * we would deadlock. Returning -EAGAIN causes the cache lookup code to
+        * punt to workqueue and retry.
+        */
+       if (current->bio_list)
+               return ERR_PTR(-EAGAIN);
+
+       if (c->try_harder && c->try_harder != cl) {
+               closure_wait_event_async(&c->try_wait, cl, !c->try_harder);
+               return ERR_PTR(-EAGAIN);
+       }
+
+       /* XXX: tracepoint */
+       c->try_harder = cl;
+       c->try_harder_start = local_clock();
+retry:
+       list_for_each_entry_reverse(i, &c->btree_cache, list) {
+               int r = mca_reap(i, cl, btree_order(k));
+               if (!r)
+                       return i;
+               if (r != -ENOMEM)
+                       ret = r;
+       }
+
+       if (ret == -EAGAIN &&
+           closure_blocking(cl)) {
+               mutex_unlock(&c->bucket_lock);
+               closure_sync(cl);
+               mutex_lock(&c->bucket_lock);
+               goto retry;
+       }
+
+       return ERR_PTR(ret);
+}
+
+/*
+ * We can only have one thread cannibalizing other cached btree nodes at a time,
+ * or we'll deadlock. We use an open-coded mutex to ensure that, which
+ * mca_cannibalize() takes. This means every time we unlock the root of
+ * the btree, we need to release this lock if we have it held.
+ */
+void bch_cannibalize_unlock(struct cache_set *c, struct closure *cl)
+{
+       if (c->try_harder == cl) {
+               bch_time_stats_update(&c->try_harder_time, c->try_harder_start);
+               c->try_harder = NULL;
+               __closure_wake_up(&c->try_wait);
+       }
+}
+
+static struct btree *mca_alloc(struct cache_set *c, struct bkey *k,
+                              int level, struct closure *cl)
+{
+       struct btree *b;
+
+       lockdep_assert_held(&c->bucket_lock);
+
+       if (mca_find(c, k))
+               return NULL;
+
+       /* btree_node_free() doesn't free memory; it sticks the node on the end
+        * of the list. Check if there are any freed nodes there:
+        */
+       list_for_each_entry(b, &c->btree_cache_freeable, list)
+               if (!mca_reap(b, NULL, btree_order(k)))
+                       goto out;
+
+       /* We never free struct btree itself, just the memory that holds the on
+        * disk node. Check the freed list before allocating a new one:
+        */
+       list_for_each_entry(b, &c->btree_cache_freed, list)
+               if (!mca_reap(b, NULL, 0)) {
+                       mca_data_alloc(b, k, __GFP_NOWARN|GFP_NOIO);
+                       if (!b->sets[0].data)
+                               goto err;
+                       else
+                               goto out;
+               }
+
+       b = mca_bucket_alloc(c, k, __GFP_NOWARN|GFP_NOIO);
+       if (!b)
+               goto err;
+
+       BUG_ON(!down_write_trylock(&b->lock));
+       if (!b->sets->data)
+               goto err;
+out:
+       BUG_ON(!closure_is_unlocked(&b->io.cl));
+
+       bkey_copy(&b->key, k);
+       list_move(&b->list, &c->btree_cache);
+       hlist_del_init_rcu(&b->hash);
+       hlist_add_head_rcu(&b->hash, mca_hash(c, k));
+
+       lock_set_subclass(&b->lock.dep_map, level + 1, _THIS_IP_);
+       b->level        = level;
+
+       mca_reinit(b);
+
+       return b;
+err:
+       if (b)
+               rw_unlock(true, b);
+
+       b = mca_cannibalize(c, k, level, cl);
+       if (!IS_ERR(b))
+               goto out;
+
+       return b;
+}
+
+/**
+ * bch_btree_node_get - find a btree node in the cache and lock it, reading it
+ * in from disk if necessary.
+ *
+ * If IO is necessary, it uses the closure embedded in struct btree_op to wait;
+ * if that closure is in non-blocking mode, this will return -EAGAIN.
+ *
+ * The btree node will have either a read or a write lock held, depending on
+ * level and op->lock.
+ */
+struct btree *bch_btree_node_get(struct cache_set *c, struct bkey *k,
+                                int level, struct btree_op *op)
+{
+       int i = 0;
+       bool write = level <= op->lock;
+       struct btree *b;
+
+       BUG_ON(level < 0);
+retry:
+       b = mca_find(c, k);
+
+       if (!b) {
+               mutex_lock(&c->bucket_lock);
+               b = mca_alloc(c, k, level, &op->cl);
+               mutex_unlock(&c->bucket_lock);
+
+               if (!b)
+                       goto retry;
+               if (IS_ERR(b))
+                       return b;
+
+               bch_btree_read(b);
+
+               if (!write)
+                       downgrade_write(&b->lock);
+       } else {
+               rw_lock(write, b, level);
+               if (PTR_HASH(c, &b->key) != PTR_HASH(c, k)) {
+                       rw_unlock(write, b);
+                       goto retry;
+               }
+               BUG_ON(b->level != level);
+       }
+
+       b->accessed = 1;
+
+       for (; i <= b->nsets && b->sets[i].size; i++) {
+               prefetch(b->sets[i].tree);
+               prefetch(b->sets[i].data);
+       }
+
+       for (; i <= b->nsets; i++)
+               prefetch(b->sets[i].data);
+
+       if (!closure_wait_event(&b->io.wait, &op->cl,
+                               btree_node_read_done(b))) {
+               rw_unlock(write, b);
+               b = ERR_PTR(-EAGAIN);
+       } else if (btree_node_io_error(b)) {
+               rw_unlock(write, b);
+               b = ERR_PTR(-EIO);
+       } else
+               BUG_ON(!b->written);
+
+       return b;
+}
+
+static void btree_node_prefetch(struct cache_set *c, struct bkey *k, int level)
+{
+       struct btree *b;
+
+       mutex_lock(&c->bucket_lock);
+       b = mca_alloc(c, k, level, NULL);
+       mutex_unlock(&c->bucket_lock);
+
+       if (!IS_ERR_OR_NULL(b)) {
+               bch_btree_read(b);
+               rw_unlock(true, b);
+       }
+}
+
+/* Btree alloc */
+
+static void btree_node_free(struct btree *b, struct btree_op *op)
+{
+       unsigned i;
+
+       /*
+        * The BUG_ON() in btree_node_get() implies that we must have a write
+        * lock on parent to free or even invalidate a node
+        */
+       BUG_ON(op->lock <= b->level);
+       BUG_ON(b == b->c->root);
+       pr_debug("bucket %s", pbtree(b));
+
+       if (btree_node_dirty(b))
+               btree_complete_write(b, btree_current_write(b));
+       clear_bit(BTREE_NODE_dirty, &b->flags);
+
+       if (b->prio_blocked &&
+           !atomic_sub_return(b->prio_blocked, &b->c->prio_blocked))
+               wake_up(&b->c->alloc_wait);
+
+       b->prio_blocked = 0;
+
+       cancel_delayed_work(&b->work);
+
+       mutex_lock(&b->c->bucket_lock);
+
+       for (i = 0; i < KEY_PTRS(&b->key); i++) {
+               BUG_ON(atomic_read(&PTR_BUCKET(b->c, &b->key, i)->pin));
+
+               bch_inc_gen(PTR_CACHE(b->c, &b->key, i),
+                           PTR_BUCKET(b->c, &b->key, i));
+       }
+
+       bch_bucket_free(b->c, &b->key);
+       mca_bucket_free(b);
+       mutex_unlock(&b->c->bucket_lock);
+}
+
+struct btree *bch_btree_node_alloc(struct cache_set *c, int level,
+                                  struct closure *cl)
+{
+       BKEY_PADDED(key) k;
+       struct btree *b = ERR_PTR(-EAGAIN);
+
+       mutex_lock(&c->bucket_lock);
+retry:
+       if (__bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, cl))
+               goto err;
+
+       SET_KEY_SIZE(&k.key, c->btree_pages * PAGE_SECTORS);
+
+       b = mca_alloc(c, &k.key, level, cl);
+       if (IS_ERR(b))
+               goto err_free;
+
+       if (!b) {
+               cache_bug(c,
+                       "Tried to allocate bucket that was in btree cache");
+               __bkey_put(c, &k.key);
+               goto retry;
+       }
+
+       set_btree_node_read_done(b);
+       b->accessed = 1;
+       bch_bset_init_next(b);
+
+       mutex_unlock(&c->bucket_lock);
+       return b;
+err_free:
+       bch_bucket_free(c, &k.key);
+       __bkey_put(c, &k.key);
+err:
+       mutex_unlock(&c->bucket_lock);
+       return b;
+}
+
+static struct btree *btree_node_alloc_replacement(struct btree *b,
+                                                 struct closure *cl)
+{
+       struct btree *n = bch_btree_node_alloc(b->c, b->level, cl);
+       if (!IS_ERR_OR_NULL(n))
+               bch_btree_sort_into(b, n);
+
+       return n;
+}
+
+/* Garbage collection */
+
+uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k)
+{
+       uint8_t stale = 0;
+       unsigned i;
+       struct bucket *g;
+
+       /*
+        * ptr_invalid() can't return true for the keys that mark btree nodes as
+        * freed, but since ptr_bad() returns true we'll never actually use them
+        * for anything, and thus we don't want to mark their pointers here
+        */
+       if (!bkey_cmp(k, &ZERO_KEY))
+               return stale;
+
+       for (i = 0; i < KEY_PTRS(k); i++) {
+               if (!ptr_available(c, k, i))
+                       continue;
+
+               g = PTR_BUCKET(c, k, i);
+
+               if (gen_after(g->gc_gen, PTR_GEN(k, i)))
+                       g->gc_gen = PTR_GEN(k, i);
+
+               if (ptr_stale(c, k, i)) {
+                       stale = max(stale, ptr_stale(c, k, i));
+                       continue;
+               }
+
+               cache_bug_on(GC_MARK(g) &&
+                            (GC_MARK(g) == GC_MARK_METADATA) != (level != 0),
+                            c, "inconsistent ptrs: mark = %llu, level = %i",
+                            GC_MARK(g), level);
+
+               if (level)
+                       SET_GC_MARK(g, GC_MARK_METADATA);
+               else if (KEY_DIRTY(k))
+                       SET_GC_MARK(g, GC_MARK_DIRTY);
+
+               /* guard against overflow */
+               SET_GC_SECTORS_USED(g, min_t(unsigned,
+                                            GC_SECTORS_USED(g) + KEY_SIZE(k),
+                                            (1 << 14) - 1));
+
+               BUG_ON(!GC_SECTORS_USED(g));
+       }
+
+       return stale;
+}
+
+#define btree_mark_key(b, k)   __bch_btree_mark_key(b->c, b->level, k)
+
+static int btree_gc_mark_node(struct btree *b, unsigned *keys,
+                             struct gc_stat *gc)
+{
+       uint8_t stale = 0;
+       unsigned last_dev = -1;
+       struct bcache_device *d = NULL;
+       struct bkey *k;
+       struct btree_iter iter;
+       struct bset_tree *t;
+
+       gc->nodes++;
+
+       for_each_key_filter(b, k, &iter, bch_ptr_invalid) {
+               if (last_dev != KEY_INODE(k)) {
+                       last_dev = KEY_INODE(k);
+
+                       d = KEY_INODE(k) < b->c->nr_uuids
+                               ? b->c->devices[last_dev]
+                               : NULL;
+               }
+
+               stale = max(stale, btree_mark_key(b, k));
+
+               if (bch_ptr_bad(b, k))
+                       continue;
+
+               *keys += bkey_u64s(k);
+
+               gc->key_bytes += bkey_u64s(k);
+               gc->nkeys++;
+
+               gc->data += KEY_SIZE(k);
+               if (KEY_DIRTY(k)) {
+                       gc->dirty += KEY_SIZE(k);
+                       if (d)
+                               d->sectors_dirty_gc += KEY_SIZE(k);
+               }
+       }
+
+       for (t = b->sets; t <= &b->sets[b->nsets]; t++)
+               btree_bug_on(t->size &&
+                            bset_written(b, t) &&
+                            bkey_cmp(&b->key, &t->end) < 0,
+                            b, "found short btree key in gc");
+
+       return stale;
+}
+
+static struct btree *btree_gc_alloc(struct btree *b, struct bkey *k,
+                                   struct btree_op *op)
+{
+       /*
+        * We block priorities from being written for the duration of garbage
+        * collection, so we can't sleep in btree_alloc() ->
+        * bch_bucket_alloc_set(), or we'd risk deadlock - so we don't pass it
+        * our closure.
+        */
+       struct btree *n = btree_node_alloc_replacement(b, NULL);
+
+       if (!IS_ERR_OR_NULL(n)) {
+               swap(b, n);
+
+               memcpy(k->ptr, b->key.ptr,
+                      sizeof(uint64_t) * KEY_PTRS(&b->key));
+
+               __bkey_put(b->c, &b->key);
+               atomic_inc(&b->c->prio_blocked);
+               b->prio_blocked++;
+
+               btree_node_free(n, op);
+               up_write(&n->lock);
+       }
+
+       return b;
+}
+
+/*
+ * Leaving this at 2 until we've got incremental garbage collection done; it
+ * could be higher (and has been tested with 4) except that garbage collection
+ * could take much longer, adversely affecting latency.
+ */
+#define GC_MERGE_NODES 2U
+
+struct gc_merge_info {
+       struct btree    *b;
+       struct bkey     *k;
+       unsigned        keys;
+};
+
+static void btree_gc_coalesce(struct btree *b, struct btree_op *op,
+                             struct gc_stat *gc, struct gc_merge_info *r)
+{
+       unsigned nodes = 0, keys = 0, blocks;
+       int i;
+
+       while (nodes < GC_MERGE_NODES && r[nodes].b)
+               keys += r[nodes++].keys;
+
+       blocks = btree_default_blocks(b->c) * 2 / 3;
+
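+       /*
+        * Only coalesce if the combined keys fit into one fewer node than we
+        * started with, with each node at most ~2/3 full - e.g. the keys of
+        * three nodes must fit in two nodes that are each 2/3 full.
+        */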
+       if (nodes < 2 ||
+           __set_blocks(b->sets[0].data, keys, b->c) > blocks * (nodes - 1))
+               return;
+
+       for (i = nodes - 1; i >= 0; --i) {
+               if (r[i].b->written)
+                       r[i].b = btree_gc_alloc(r[i].b, r[i].k, op);
+
+               if (r[i].b->written)
+                       return;
+       }
+
+       for (i = nodes - 1; i > 0; --i) {
+               struct bset *n1 = r[i].b->sets->data;
+               struct bset *n2 = r[i - 1].b->sets->data;
+               struct bkey *k, *last = NULL;
+
+               keys = 0;
+
+               if (i == 1) {
+                       /*
+                        * Last node we're not getting rid of - we're getting
+                        * rid of the node at r[0]. Have to try and fit all of
+                        * the remaining keys into this node; we can't ensure
+                        * they will always fit due to rounding and variable
+                        * length keys (overflow shouldn't be possible in
+                        * practice, though)
+                        */
+                       if (__set_blocks(n1, n1->keys + r->keys,
+                                        b->c) > btree_blocks(r[i].b))
+                               return;
+
+                       keys = n2->keys;
+                       last = &r->b->key;
+               } else
+                       for (k = n2->start;
+                            k < end(n2);
+                            k = bkey_next(k)) {
+                               if (__set_blocks(n1, n1->keys + keys +
+                                                bkey_u64s(k), b->c) > blocks)
+                                       break;
+
+                               last = k;
+                               keys += bkey_u64s(k);
+                       }
+
+               BUG_ON(__set_blocks(n1, n1->keys + keys,
+                                   b->c) > btree_blocks(r[i].b));
+
+               if (last) {
+                       bkey_copy_key(&r[i].b->key, last);
+                       bkey_copy_key(r[i].k, last);
+               }
+
+               memcpy(end(n1),
+                      n2->start,
+                      (void *) node(n2, keys) - (void *) n2->start);
+
+               n1->keys += keys;
+
+               memmove(n2->start,
+                       node(n2, keys),
+                       (void *) end(n2) - (void *) node(n2, keys));
+
+               n2->keys -= keys;
+
+               r[i].keys       = n1->keys;
+               r[i - 1].keys   = n2->keys;
+       }
+
+       btree_node_free(r->b, op);
+       up_write(&r->b->lock);
+
+       pr_debug("coalesced %u nodes", nodes);
+
+       gc->nodes--;
+       nodes--;
+
+       memmove(&r[0], &r[1], sizeof(struct gc_merge_info) * nodes);
+       memset(&r[nodes], 0, sizeof(struct gc_merge_info));
+}
+
+static int btree_gc_recurse(struct btree *b, struct btree_op *op,
+                           struct closure *writes, struct gc_stat *gc)
+{
+       void write(struct btree *r)
+       {
+               if (!r->written)
+                       bch_btree_write(r, true, op);
+               else if (btree_node_dirty(r)) {
+                       BUG_ON(btree_current_write(r)->owner);
+                       btree_current_write(r)->owner = writes;
+                       closure_get(writes);
+
+                       bch_btree_write(r, true, NULL);
+               }
+
+               up_write(&r->lock);
+       }
+
+       int ret = 0, stale;
+       unsigned i;
+       struct gc_merge_info r[GC_MERGE_NODES];
+
+       memset(r, 0, sizeof(r));
+
+       while ((r->k = bch_next_recurse_key(b, &b->c->gc_done))) {
+               r->b = bch_btree_node_get(b->c, r->k, b->level - 1, op);
+
+               if (IS_ERR(r->b)) {
+                       ret = PTR_ERR(r->b);
+                       break;
+               }
+
+               r->keys = 0;
+               stale = btree_gc_mark_node(r->b, &r->keys, gc);
+
+               if (!b->written &&
+                   (r->b->level || stale > 10 ||
+                    b->c->gc_always_rewrite))
+                       r->b = btree_gc_alloc(r->b, r->k, op);
+
+               if (r->b->level)
+                       ret = btree_gc_recurse(r->b, op, writes, gc);
+
+               if (ret) {
+                       write(r->b);
+                       break;
+               }
+
+               bkey_copy_key(&b->c->gc_done, r->k);
+
+               if (!b->written)
+                       btree_gc_coalesce(b, op, gc, r);
+
+               if (r[GC_MERGE_NODES - 1].b)
+                       write(r[GC_MERGE_NODES - 1].b);
+
+               memmove(&r[1], &r[0],
+                       sizeof(struct gc_merge_info) * (GC_MERGE_NODES - 1));
+
+               /* When we've got incremental GC working, we'll want to do
+                * if (should_resched())
+                *      return -EAGAIN;
+                */
+               cond_resched();
+#if 0
+               if (need_resched()) {
+                       ret = -EAGAIN;
+                       break;
+               }
+#endif
+       }
+
+       for (i = 1; i < GC_MERGE_NODES && r[i].b; i++)
+               write(r[i].b);
+
+       /* Might have freed some children, must remove their keys */
+       if (!b->written)
+               bch_btree_sort(b);
+
+       return ret;
+}
+
+static int bch_btree_gc_root(struct btree *b, struct btree_op *op,
+                            struct closure *writes, struct gc_stat *gc)
+{
+       struct btree *n = NULL;
+       unsigned keys = 0;
+       int ret = 0, stale = btree_gc_mark_node(b, &keys, gc);
+
+       if (b->level || stale > 10)
+               n = btree_node_alloc_replacement(b, NULL);
+
+       if (!IS_ERR_OR_NULL(n))
+               swap(b, n);
+
+       if (b->level)
+               ret = btree_gc_recurse(b, op, writes, gc);
+
+       if (!b->written || btree_node_dirty(b)) {
+               atomic_inc(&b->c->prio_blocked);
+               b->prio_blocked++;
+               bch_btree_write(b, true, n ? op : NULL);
+       }
+
+       if (!IS_ERR_OR_NULL(n)) {
+               closure_sync(&op->cl);
+               bch_btree_set_root(b);
+               btree_node_free(n, op);
+               rw_unlock(true, b);
+       }
+
+       return ret;
+}
+
+static void btree_gc_start(struct cache_set *c)
+{
+       struct cache *ca;
+       struct bucket *b;
+       struct bcache_device **d;
+       unsigned i;
+
+       if (!c->gc_mark_valid)
+               return;
+
+       mutex_lock(&c->bucket_lock);
+
+       c->gc_mark_valid = 0;
+       c->gc_done = ZERO_KEY;
+
+       for_each_cache(ca, c, i)
+               for_each_bucket(b, ca) {
+                       b->gc_gen = b->gen;
+                       if (!atomic_read(&b->pin))
+                               SET_GC_MARK(b, GC_MARK_RECLAIMABLE);
+               }
+
+       for (d = c->devices;
+            d < c->devices + c->nr_uuids;
+            d++)
+               if (*d)
+                       (*d)->sectors_dirty_gc = 0;
+
+       mutex_unlock(&c->bucket_lock);
+}
+
+size_t bch_btree_gc_finish(struct cache_set *c)
+{
+       size_t available = 0;
+       struct bucket *b;
+       struct cache *ca;
+       struct bcache_device **d;
+       unsigned i;
+
+       mutex_lock(&c->bucket_lock);
+
+       set_gc_sectors(c);
+       c->gc_mark_valid = 1;
+       c->need_gc      = 0;
+
+       if (c->root)
+               for (i = 0; i < KEY_PTRS(&c->root->key); i++)
+                       SET_GC_MARK(PTR_BUCKET(c, &c->root->key, i),
+                                   GC_MARK_METADATA);
+
+       for (i = 0; i < KEY_PTRS(&c->uuid_bucket); i++)
+               SET_GC_MARK(PTR_BUCKET(c, &c->uuid_bucket, i),
+                           GC_MARK_METADATA);
+
+       for_each_cache(ca, c, i) {
+               uint64_t *i;
+
+               ca->invalidate_needs_gc = 0;
+
+               for (i = ca->sb.d; i < ca->sb.d + ca->sb.keys; i++)
+                       SET_GC_MARK(ca->buckets + *i, GC_MARK_METADATA);
+
+               for (i = ca->prio_buckets;
+                    i < ca->prio_buckets + prio_buckets(ca) * 2; i++)
+                       SET_GC_MARK(ca->buckets + *i, GC_MARK_METADATA);
+
+               for_each_bucket(b, ca) {
+                       b->last_gc      = b->gc_gen;
+                       c->need_gc      = max(c->need_gc, bucket_gc_gen(b));
+
+                       if (!atomic_read(&b->pin) &&
+                           GC_MARK(b) == GC_MARK_RECLAIMABLE) {
+                               available++;
+                               if (!GC_SECTORS_USED(b))
+                                       bch_bucket_add_unused(ca, b);
+                       }
+               }
+       }
+
+       for (d = c->devices;
+            d < c->devices + c->nr_uuids;
+            d++)
+               if (*d) {
+                       unsigned long last =
+                               atomic_long_read(&((*d)->sectors_dirty));
+                       long difference = (*d)->sectors_dirty_gc - last;
+
+                       pr_debug("sectors dirty off by %li", difference);
+
+                       (*d)->sectors_dirty_last += difference;
+
+                       atomic_long_set(&((*d)->sectors_dirty),
+                                       (*d)->sectors_dirty_gc);
+               }
+
+       mutex_unlock(&c->bucket_lock);
+       return available;
+}
+
+static void bch_btree_gc(struct closure *cl)
+{
+       struct cache_set *c = container_of(cl, struct cache_set, gc.cl);
+       int ret;
+       unsigned long available;
+       struct gc_stat stats;
+       struct closure writes;
+       struct btree_op op;
+
+       uint64_t start_time = local_clock();
+       trace_bcache_gc_start(c->sb.set_uuid);
+       blktrace_msg_all(c, "Starting gc");
+
+       memset(&stats, 0, sizeof(struct gc_stat));
+       closure_init_stack(&writes);
+       bch_btree_op_init_stack(&op);
+       op.lock = SHRT_MAX;
+
+       btree_gc_start(c);
+
+       ret = btree_root(gc_root, c, &op, &writes, &stats);
+       closure_sync(&op.cl);
+       closure_sync(&writes);
+
+       if (ret) {
+               blktrace_msg_all(c, "Stopped gc");
+               pr_warn("gc failed!");
+
+               continue_at(cl, bch_btree_gc, bch_gc_wq);
+       }
+
+       /* Possibly wait for new UUIDs or whatever to hit disk */
+       bch_journal_meta(c, &op.cl);
+       closure_sync(&op.cl);
+
+       available = bch_btree_gc_finish(c);
+
+       bch_time_stats_update(&c->btree_gc_time, start_time);
+
+       stats.key_bytes *= sizeof(uint64_t);
+       stats.dirty     <<= 9;
+       stats.data      <<= 9;
+       stats.in_use    = (c->nbuckets - available) * 100 / c->nbuckets;
+       memcpy(&c->gc_stats, &stats, sizeof(struct gc_stat));
+       blktrace_msg_all(c, "Finished gc");
+
+       trace_bcache_gc_end(c->sb.set_uuid);
+       wake_up(&c->alloc_wait);
+
+       continue_at(cl, bch_moving_gc, bch_gc_wq);
+}
+
+void bch_queue_gc(struct cache_set *c)
+{
+       closure_trylock_call(&c->gc.cl, bch_btree_gc, bch_gc_wq, &c->cl);
+}
+
+/* Initial partial gc */
+
+static int bch_btree_check_recurse(struct btree *b, struct btree_op *op,
+                                  unsigned long **seen)
+{
+       int ret;
+       unsigned i;
+       struct bkey *k;
+       struct bucket *g;
+       struct btree_iter iter;
+
+       for_each_key_filter(b, k, &iter, bch_ptr_invalid) {
+               for (i = 0; i < KEY_PTRS(k); i++) {
+                       if (!ptr_available(b->c, k, i))
+                               continue;
+
+                       g = PTR_BUCKET(b->c, k, i);
+
+                       if (!__test_and_set_bit(PTR_BUCKET_NR(b->c, k, i),
+                                               seen[PTR_DEV(k, i)]) ||
+                           !ptr_stale(b->c, k, i)) {
+                               g->gen = PTR_GEN(k, i);
+
+                               if (b->level)
+                                       g->prio = BTREE_PRIO;
+                               else if (g->prio == BTREE_PRIO)
+                                       g->prio = INITIAL_PRIO;
+                       }
+               }
+
+               btree_mark_key(b, k);
+       }
+
+       if (b->level) {
+               k = bch_next_recurse_key(b, &ZERO_KEY);
+
+               while (k) {
+                       struct bkey *p = bch_next_recurse_key(b, k);
+                       if (p)
+                               btree_node_prefetch(b->c, p, b->level - 1);
+
+                       ret = btree(check_recurse, k, b, op, seen);
+                       if (ret)
+                               return ret;
+
+                       k = p;
+               }
+       }
+
+       return 0;
+}
+
+int bch_btree_check(struct cache_set *c, struct btree_op *op)
+{
+       int ret = -ENOMEM;
+       unsigned i;
+       unsigned long *seen[MAX_CACHES_PER_SET];
+
+       memset(seen, 0, sizeof(seen));
+
+       for (i = 0; c->cache[i]; i++) {
+               size_t n = DIV_ROUND_UP(c->cache[i]->sb.nbuckets, 8);
+               seen[i] = kmalloc(n, GFP_KERNEL);
+               if (!seen[i])
+                       goto err;
+
+               /* Disables the seen array until prio_read() uses it too */
+               memset(seen[i], 0xFF, n);
+       }
+
+       ret = btree_root(check_recurse, c, op, seen);
+err:
+       for (i = 0; i < MAX_CACHES_PER_SET; i++)
+               kfree(seen[i]);
+       return ret;
+}
+
+/* Btree insertion */
+
+static void shift_keys(struct btree *b, struct bkey *where, struct bkey *insert)
+{
+       struct bset *i = b->sets[b->nsets].data;
+
+       memmove((uint64_t *) where + bkey_u64s(insert),
+               where,
+               (void *) end(i) - (void *) where);
+
+       i->keys += bkey_u64s(insert);
+       bkey_copy(where, insert);
+       bch_bset_fix_lookup_table(b, where);
+}
+
+static bool fix_overlapping_extents(struct btree *b,
+                                   struct bkey *insert,
+                                   struct btree_iter *iter,
+                                   struct btree_op *op)
+{
+       void subtract_dirty(struct bkey *k, int sectors)
+       {
+               struct bcache_device *d = b->c->devices[KEY_INODE(k)];
+
+               if (KEY_DIRTY(k) && d)
+                       atomic_long_sub(sectors, &d->sectors_dirty);
+       }
+
+       unsigned old_size, sectors_found = 0;
+
+       while (1) {
+               struct bkey *k = bch_btree_iter_next(iter);
+               if (!k ||
+                   bkey_cmp(&START_KEY(k), insert) >= 0)
+                       break;
+
+               if (bkey_cmp(k, &START_KEY(insert)) <= 0)
+                       continue;
+
+               old_size = KEY_SIZE(k);
+
+               /*
+                * We might overlap with 0 size extents; we can't skip these
+                * because if they're in the set we're inserting to we have to
+                * adjust them so they don't overlap with the key we're
+                * inserting. But we don't want to check them for BTREE_REPLACE
+                * operations.
+                */
+
+               if (op->type == BTREE_REPLACE &&
+                   KEY_SIZE(k)) {
+                       /*
+                        * k might have been split since we inserted/found the
+                        * key we're replacing
+                        */
+                       unsigned i;
+                       uint64_t offset = KEY_START(k) -
+                               KEY_START(&op->replace);
+
+                       /* But it must be a subset of the replace key */
+                       if (KEY_START(k) < KEY_START(&op->replace) ||
+                           KEY_OFFSET(k) > KEY_OFFSET(&op->replace))
+                               goto check_failed;
+
+                       /* We didn't find a key that we were supposed to */
+                       if (KEY_START(k) > KEY_START(insert) + sectors_found)
+                               goto check_failed;
+
+                       if (KEY_PTRS(&op->replace) != KEY_PTRS(k))
+                               goto check_failed;
+
+                       /* skip past gen */
+                       offset <<= 8;
+
+                       BUG_ON(!KEY_PTRS(&op->replace));
+
+                       for (i = 0; i < KEY_PTRS(&op->replace); i++)
+                               if (k->ptr[i] != op->replace.ptr[i] + offset)
+                                       goto check_failed;
+
+                       sectors_found = KEY_OFFSET(k) - KEY_START(insert);
+               }
+
+               if (bkey_cmp(insert, k) < 0 &&
+                   bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0) {
+                       /*
+                        * We overlapped in the middle of an existing key: that
+                        * means we have to split the old key. But we have to do
+                        * slightly different things depending on whether the
+                        * old key has been written out yet.
+                        */
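+                       /*
+                        * E.g. (sector ranges): if k covers [0, 16) and insert
+                        * covers [4, 12), k is cut back to [0, 4) and the new
+                        * "top" key ends up covering [12, 16).
+                        */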
+
+                       struct bkey *top;
+
+                       subtract_dirty(k, KEY_SIZE(insert));
+
+                       if (bkey_written(b, k)) {
+                               /*
+                                * We insert a new key to cover the top of the
+                                * old key, and the old key is modified in place
+                                * to represent the bottom split.
+                                *
+                                * It's completely arbitrary whether the new key
+                                * is the top or the bottom, but it has to match
+                                * up with what btree_sort_fixup() does - it
+                                * doesn't check for this kind of overlap, it
+                                * depends on us inserting a new key for the top
+                                * here.
+                                */
+                               top = bch_bset_search(b, &b->sets[b->nsets],
+                                                     insert);
+                               shift_keys(b, top, k);
+                       } else {
+                               BKEY_PADDED(key) temp;
+                               bkey_copy(&temp.key, k);
+                               shift_keys(b, k, &temp.key);
+                               top = bkey_next(k);
+                       }
+
+                       bch_cut_front(insert, top);
+                       bch_cut_back(&START_KEY(insert), k);
+                       bch_bset_fix_invalidated_key(b, k);
+                       return false;
+               }
+
+               if (bkey_cmp(insert, k) < 0) {
+                       bch_cut_front(insert, k);
+               } else {
+                       if (bkey_written(b, k) &&
+                           bkey_cmp(&START_KEY(insert), &START_KEY(k)) <= 0) {
+                               /*
+                                * Completely overwrote, so we don't have to
+                                * invalidate the binary search tree
+                                */
+                               bch_cut_front(k, k);
+                       } else {
+                               __bch_cut_back(&START_KEY(insert), k);
+                               bch_bset_fix_invalidated_key(b, k);
+                       }
+               }
+
+               subtract_dirty(k, old_size - KEY_SIZE(k));
+       }
+
+check_failed:
+       if (op->type == BTREE_REPLACE) {
+               if (!sectors_found) {
+                       op->insert_collision = true;
+                       return true;
+               } else if (sectors_found < KEY_SIZE(insert)) {
+                       SET_KEY_OFFSET(insert, KEY_OFFSET(insert) -
+                                      (KEY_SIZE(insert) - sectors_found));
+                       SET_KEY_SIZE(insert, sectors_found);
+               }
+       }
+
+       return false;
+}
+
+static bool btree_insert_key(struct btree *b, struct btree_op *op,
+                            struct bkey *k)
+{
+       struct bset *i = b->sets[b->nsets].data;
+       struct bkey *m, *prev;
+       const char *status = "insert";
+
+       BUG_ON(bkey_cmp(k, &b->key) > 0);
+       BUG_ON(b->level && !KEY_PTRS(k));
+       BUG_ON(!b->level && !KEY_OFFSET(k));
+
+       if (!b->level) {
+               struct btree_iter iter;
+               struct bkey search = KEY(KEY_INODE(k), KEY_START(k), 0);
+
+               /*
+                * bset_search() returns the first key that is strictly greater
+                * than the search key - but for back merging, we want to find
+                * the first key that is greater than or equal to KEY_START(k) -
+                * unless KEY_START(k) is 0.
+                */
+               if (KEY_OFFSET(&search))
+                       SET_KEY_OFFSET(&search, KEY_OFFSET(&search) - 1);
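+               /*
+                * E.g. if KEY_START(k) is 8 we search for offset 7, so that an
+                * existing key ending exactly at 8 is returned by the iterator
+                * and can be considered for a back merge.
+                */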
+
+               prev = NULL;
+               m = bch_btree_iter_init(b, &iter, &search);
+
+               if (fix_overlapping_extents(b, k, &iter, op))
+                       return false;
+
+               while (m != end(i) &&
+                      bkey_cmp(k, &START_KEY(m)) > 0)
+                       prev = m, m = bkey_next(m);
+
+               if (key_merging_disabled(b->c))
+                       goto insert;
+
+               /* prev is in the tree, if we merge we're done */
+               status = "back merging";
+               if (prev &&
+                   bch_bkey_try_merge(b, prev, k))
+                       goto merged;
+
+               status = "overwrote front";
+               if (m != end(i) &&
+                   KEY_PTRS(m) == KEY_PTRS(k) && !KEY_SIZE(m))
+                       goto copy;
+
+               status = "front merge";
+               if (m != end(i) &&
+                   bch_bkey_try_merge(b, k, m))
+                       goto copy;
+       } else
+               m = bch_bset_search(b, &b->sets[b->nsets], k);
+
+insert:        shift_keys(b, m, k);
+copy:  bkey_copy(m, k);
+merged:
+       bch_check_keys(b, "%s for %s at %s: %s", status,
+                      op_type(op), pbtree(b), pkey(k));
+       bch_check_key_order_msg(b, i, "%s for %s at %s: %s", status,
+                               op_type(op), pbtree(b), pkey(k));
+
+       if (b->level && !KEY_OFFSET(k))
+               b->prio_blocked++;
+
+       pr_debug("%s for %s at %s: %s", status,
+                op_type(op), pbtree(b), pkey(k));
+
+       return true;
+}
+
+bool bch_btree_insert_keys(struct btree *b, struct btree_op *op)
+{
+       bool ret = false;
+       struct bkey *k;
+       unsigned oldsize = bch_count_data(b);
+
+       while ((k = bch_keylist_pop(&op->keys))) {
+               bkey_put(b->c, k, b->level);
+               ret |= btree_insert_key(b, op, k);
+       }
+
+       BUG_ON(bch_count_data(b) < oldsize);
+       return ret;
+}
+
+bool bch_btree_insert_check_key(struct btree *b, struct btree_op *op,
+                                  struct bio *bio)
+{
+       bool ret = false;
+       uint64_t btree_ptr = b->key.ptr[0];
+       unsigned long seq = b->seq;
+       BKEY_PADDED(k) tmp;
+
+       rw_unlock(false, b);
+       rw_lock(true, b, b->level);
+
+       if (b->key.ptr[0] != btree_ptr ||
+           b->seq != seq + 1 ||
+           should_split(b))
+               goto out;
+
+       op->replace = KEY(op->inode, bio_end(bio), bio_sectors(bio));
+
+       SET_KEY_PTRS(&op->replace, 1);
+       get_random_bytes(&op->replace.ptr[0], sizeof(uint64_t));
+
+       SET_PTR_DEV(&op->replace, 0, PTR_CHECK_DEV);
+
+       bkey_copy(&tmp.k, &op->replace);
+
+       BUG_ON(op->type != BTREE_INSERT);
+       BUG_ON(!btree_insert_key(b, op, &tmp.k));
+       bch_btree_write(b, false, NULL);
+       ret = true;
+out:
+       downgrade_write(&b->lock);
+       return ret;
+}
+
+static int btree_split(struct btree *b, struct btree_op *op)
+{
+       bool split, root = b == b->c->root;
+       struct btree *n1, *n2 = NULL, *n3 = NULL;
+       uint64_t start_time = local_clock();
+
+       if (b->level)
+               set_closure_blocking(&op->cl);
+
+       n1 = btree_node_alloc_replacement(b, &op->cl);
+       if (IS_ERR(n1))
+               goto err;
+
+       split = set_blocks(n1->sets[0].data, n1->c) > (btree_blocks(b) * 4) / 5;
+
+       pr_debug("%ssplitting at %s keys %i", split ? "" : "not ",
+                pbtree(b), n1->sets[0].data->keys);
+
+       if (split) {
+               unsigned keys = 0;
+
+               n2 = bch_btree_node_alloc(b->c, b->level, &op->cl);
+               if (IS_ERR(n2))
+                       goto err_free1;
+
+               if (root) {
+                       n3 = bch_btree_node_alloc(b->c, b->level + 1, &op->cl);
+                       if (IS_ERR(n3))
+                               goto err_free2;
+               }
+
+               bch_btree_insert_keys(n1, op);
+
+               /* Has to be a linear search because we don't have an auxiliary
+                * search tree yet
+                */
+
+               while (keys < (n1->sets[0].data->keys * 3) / 5)
+                       keys += bkey_u64s(node(n1->sets[0].data, keys));
+
+               bkey_copy_key(&n1->key, node(n1->sets[0].data, keys));
+               keys += bkey_u64s(node(n1->sets[0].data, keys));
+
+               n2->sets[0].data->keys = n1->sets[0].data->keys - keys;
+               n1->sets[0].data->keys = keys;
+
+               memcpy(n2->sets[0].data->start,
+                      end(n1->sets[0].data),
+                      n2->sets[0].data->keys * sizeof(uint64_t));
+
+               bkey_copy_key(&n2->key, &b->key);
+
+               bch_keylist_add(&op->keys, &n2->key);
+               bch_btree_write(n2, true, op);
+               rw_unlock(true, n2);
+       } else
+               bch_btree_insert_keys(n1, op);
+
+       bch_keylist_add(&op->keys, &n1->key);
+       bch_btree_write(n1, true, op);
+
+       if (n3) {
+               bkey_copy_key(&n3->key, &MAX_KEY);
+               bch_btree_insert_keys(n3, op);
+               bch_btree_write(n3, true, op);
+
+               closure_sync(&op->cl);
+               bch_btree_set_root(n3);
+               rw_unlock(true, n3);
+       } else if (root) {
+               op->keys.top = op->keys.bottom;
+               closure_sync(&op->cl);
+               bch_btree_set_root(n1);
+       } else {
+               unsigned i;
+
+               bkey_copy(op->keys.top, &b->key);
+               bkey_copy_key(op->keys.top, &ZERO_KEY);
+
+               for (i = 0; i < KEY_PTRS(&b->key); i++) {
+                       uint8_t g = PTR_BUCKET(b->c, &b->key, i)->gen + 1;
+
+                       SET_PTR_GEN(op->keys.top, i, g);
+               }
+
+               bch_keylist_push(&op->keys);
+               closure_sync(&op->cl);
+               atomic_inc(&b->c->prio_blocked);
+       }
+
+       rw_unlock(true, n1);
+       btree_node_free(b, op);
+
+       bch_time_stats_update(&b->c->btree_split_time, start_time);
+
+       return 0;
+err_free2:
+       __bkey_put(n2->c, &n2->key);
+       btree_node_free(n2, op);
+       rw_unlock(true, n2);
+err_free1:
+       __bkey_put(n1->c, &n1->key);
+       btree_node_free(n1, op);
+       rw_unlock(true, n1);
+err:
+       if (n3 == ERR_PTR(-EAGAIN) ||
+           n2 == ERR_PTR(-EAGAIN) ||
+           n1 == ERR_PTR(-EAGAIN))
+               return -EAGAIN;
+
+       pr_warn("couldn't split");
+       return -ENOMEM;
+}
+
+static int bch_btree_insert_recurse(struct btree *b, struct btree_op *op,
+                                   struct keylist *stack_keys)
+{
+       if (b->level) {
+               int ret;
+               struct bkey *insert = op->keys.bottom;
+               struct bkey *k = bch_next_recurse_key(b, &START_KEY(insert));
+
+               if (!k) {
+                       btree_bug(b, "no key to recurse on at level %i/%i",
+                                 b->level, b->c->root->level);
+
+                       op->keys.top = op->keys.bottom;
+                       return -EIO;
+               }
+
+               if (bkey_cmp(insert, k) > 0) {
+                       unsigned i;
+
+                       if (op->type == BTREE_REPLACE) {
+                               __bkey_put(b->c, insert);
+                               op->keys.top = op->keys.bottom;
+                               op->insert_collision = true;
+                               return 0;
+                       }
+
+                       for (i = 0; i < KEY_PTRS(insert); i++)
+                               atomic_inc(&PTR_BUCKET(b->c, insert, i)->pin);
+
+                       bkey_copy(stack_keys->top, insert);
+
+                       bch_cut_back(k, insert);
+                       bch_cut_front(k, stack_keys->top);
+
+                       bch_keylist_push(stack_keys);
+               }
+
+               ret = btree(insert_recurse, k, b, op, stack_keys);
+               if (ret)
+                       return ret;
+       }
+
+       if (!bch_keylist_empty(&op->keys)) {
+               if (should_split(b)) {
+                       if (op->lock <= b->c->root->level) {
+                               BUG_ON(b->level);
+                               op->lock = b->c->root->level + 1;
+                               return -EINTR;
+                       }
+                       return btree_split(b, op);
+               }
+
+               BUG_ON(write_block(b) != b->sets[b->nsets].data);
+
+               if (bch_btree_insert_keys(b, op))
+                       bch_btree_write(b, false, op);
+       }
+
+       return 0;
+}
+
+int bch_btree_insert(struct btree_op *op, struct cache_set *c)
+{
+       int ret = 0;
+       struct keylist stack_keys;
+
+       /*
+        * Don't want to block with the btree locked unless we have to,
+        * otherwise we get deadlocks with try_harder and between split/gc
+        */
+       clear_closure_blocking(&op->cl);
+
+       BUG_ON(bch_keylist_empty(&op->keys));
+       bch_keylist_copy(&stack_keys, &op->keys);
+       bch_keylist_init(&op->keys);
+
+       while (!bch_keylist_empty(&stack_keys) ||
+              !bch_keylist_empty(&op->keys)) {
+               if (bch_keylist_empty(&op->keys)) {
+                       bch_keylist_add(&op->keys,
+                                       bch_keylist_pop(&stack_keys));
+                       op->lock = 0;
+               }
+
+               ret = btree_root(insert_recurse, c, op, &stack_keys);
+
+               if (ret == -EAGAIN) {
+                       ret = 0;
+                       closure_sync(&op->cl);
+               } else if (ret) {
+                       struct bkey *k;
+
+                       pr_err("error %i trying to insert key for %s",
+                              ret, op_type(op));
+
+                       while ((k = bch_keylist_pop(&stack_keys) ?:
+                                   bch_keylist_pop(&op->keys)))
+                               bkey_put(c, k, 0);
+               }
+       }
+
+       bch_keylist_free(&stack_keys);
+
+       if (op->journal)
+               atomic_dec_bug(op->journal);
+       op->journal = NULL;
+       return ret;
+}
+
+void bch_btree_set_root(struct btree *b)
+{
+       unsigned i;
+
+       BUG_ON(!b->written);
+
+       for (i = 0; i < KEY_PTRS(&b->key); i++)
+               BUG_ON(PTR_BUCKET(b->c, &b->key, i)->prio != BTREE_PRIO);
+
+       mutex_lock(&b->c->bucket_lock);
+       list_del_init(&b->list);
+       mutex_unlock(&b->c->bucket_lock);
+
+       b->c->root = b;
+       __bkey_put(b->c, &b->key);
+
+       bch_journal_meta(b->c, NULL);
+       pr_debug("%s for %pf", pbtree(b), __builtin_return_address(0));
+}
+
+/* Cache lookup */
+
+static int submit_partial_cache_miss(struct btree *b, struct btree_op *op,
+                                    struct bkey *k)
+{
+       struct search *s = container_of(op, struct search, op);
+       struct bio *bio = &s->bio.bio;
+       int ret = 0;
+
+       while (!ret &&
+              !op->lookup_done) {
+               unsigned sectors = INT_MAX;
+
+               if (KEY_INODE(k) == op->inode) {
+                       if (KEY_START(k) <= bio->bi_sector)
+                               break;
+
+                       sectors = min_t(uint64_t, sectors,
+                                       KEY_START(k) - bio->bi_sector);
+               }
+
+               ret = s->d->cache_miss(b, s, bio, sectors);
+       }
+
+       return ret;
+}
+
+/*
+ * Read from a single key, handling the initial cache miss if the key starts in
+ * the middle of the bio
+ */
+static int submit_partial_cache_hit(struct btree *b, struct btree_op *op,
+                                   struct bkey *k)
+{
+       struct search *s = container_of(op, struct search, op);
+       struct bio *bio = &s->bio.bio;
+       unsigned ptr;
+       struct bio *n;
+
+       int ret = submit_partial_cache_miss(b, op, k);
+       if (ret || op->lookup_done)
+               return ret;
+
+       /* XXX: figure out best pointer - for multiple cache devices */
+       ptr = 0;
+
+       PTR_BUCKET(b->c, k, ptr)->prio = INITIAL_PRIO;
+
+       while (!op->lookup_done &&
+              KEY_INODE(k) == op->inode &&
+              bio->bi_sector < KEY_OFFSET(k)) {
+               struct bkey *bio_key;
+               sector_t sector = PTR_OFFSET(k, ptr) +
+                       (bio->bi_sector - KEY_START(k));
+               unsigned sectors = min_t(uint64_t, INT_MAX,
+                                        KEY_OFFSET(k) - bio->bi_sector);
+
+               n = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split);
+               if (!n)
+                       return -EAGAIN;
+
+               if (n == bio)
+                       op->lookup_done = true;
+
+               bio_key = &container_of(n, struct bbio, bio)->key;
+
+               /*
+                * The bucket we're reading from might be reused while our bio
+                * is in flight, and we could then end up reading the wrong
+                * data.
+                *
+                * We guard against this by checking (in cache_read_endio()) if
+                * the pointer is stale again; if so, we treat it as an error
+                * and reread from the backing device (but we don't pass that
+                * error up anywhere).
+                */
+
+               bch_bkey_copy_single_ptr(bio_key, k, ptr);
+               SET_PTR_OFFSET(bio_key, 0, sector);
+
+               n->bi_end_io    = bch_cache_read_endio;
+               n->bi_private   = &s->cl;
+
+               trace_bcache_cache_hit(n);
+               __bch_submit_bbio(n, b->c);
+       }
+
+       return 0;
+}
+
+int bch_btree_search_recurse(struct btree *b, struct btree_op *op)
+{
+       struct search *s = container_of(op, struct search, op);
+       struct bio *bio = &s->bio.bio;
+
+       int ret = 0;
+       struct bkey *k;
+       struct btree_iter iter;
+       bch_btree_iter_init(b, &iter, &KEY(op->inode, bio->bi_sector, 0));
+
+       pr_debug("at %s searching for %u:%llu", pbtree(b), op->inode,
+                (uint64_t) bio->bi_sector);
+
+       do {
+               k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad);
+               if (!k) {
+                       /*
+                        * b->key would be exactly what we want, except that
+                        * pointers to btree nodes have nonzero size - we
+                        * wouldn't go far enough
+                        */
+
+                       ret = submit_partial_cache_miss(b, op,
+                                       &KEY(KEY_INODE(&b->key),
+                                            KEY_OFFSET(&b->key), 0));
+                       break;
+               }
+
+               ret = b->level
+                       ? btree(search_recurse, k, b, op)
+                       : submit_partial_cache_hit(b, op, k);
+       } while (!ret &&
+                !op->lookup_done);
+
+       return ret;
+}
+
+/* Keybuf code */
+
+static inline int keybuf_cmp(struct keybuf_key *l, struct keybuf_key *r)
+{
+       /* Overlapping keys compare equal */
+       if (bkey_cmp(&l->key, &START_KEY(&r->key)) <= 0)
+               return -1;
+       if (bkey_cmp(&START_KEY(&l->key), &r->key) >= 0)
+               return 1;
+       return 0;
+}
+
+static inline int keybuf_nonoverlapping_cmp(struct keybuf_key *l,
+                                           struct keybuf_key *r)
+{
+       return clamp_t(int64_t, bkey_cmp(&l->key, &r->key), -1, 1);
+}
+
+static int bch_btree_refill_keybuf(struct btree *b, struct btree_op *op,
+                                  struct keybuf *buf, struct bkey *end)
+{
+       struct btree_iter iter;
+       bch_btree_iter_init(b, &iter, &buf->last_scanned);
+
+       while (!array_freelist_empty(&buf->freelist)) {
+               struct bkey *k = bch_btree_iter_next_filter(&iter, b,
+                                                           bch_ptr_bad);
+
+               if (!b->level) {
+                       if (!k) {
+                               buf->last_scanned = b->key;
+                               break;
+                       }
+
+                       buf->last_scanned = *k;
+                       if (bkey_cmp(&buf->last_scanned, end) >= 0)
+                               break;
+
+                       if (buf->key_predicate(buf, k)) {
+                               struct keybuf_key *w;
+
+                               pr_debug("%s", pkey(k));
+
+                               spin_lock(&buf->lock);
+
+                               w = array_alloc(&buf->freelist);
+
+                               w->private = NULL;
+                               bkey_copy(&w->key, k);
+
+                               if (RB_INSERT(&buf->keys, w, node, keybuf_cmp))
+                                       array_free(&buf->freelist, w);
+
+                               spin_unlock(&buf->lock);
+                       }
+               } else {
+                       if (!k)
+                               break;
+
+                       btree(refill_keybuf, k, b, op, buf, end);
+                       /*
+                        * Might get an error here, but can't really do anything
+                        * and it'll get logged elsewhere. Just read what we
+                        * can.
+                        */
+
+                       if (bkey_cmp(&buf->last_scanned, end) >= 0)
+                               break;
+
+                       cond_resched();
+               }
+       }
+
+       return 0;
+}
+
+void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf,
+                         struct bkey *end)
+{
+       struct bkey start = buf->last_scanned;
+       struct btree_op op;
+       bch_btree_op_init_stack(&op);
+
+       cond_resched();
+
+       btree_root(refill_keybuf, c, &op, buf, end);
+       closure_sync(&op.cl);
+
+       pr_debug("found %s keys from %llu:%llu to %llu:%llu",
+                RB_EMPTY_ROOT(&buf->keys) ? "no" :
+                array_freelist_empty(&buf->freelist) ? "some" : "a few",
+                KEY_INODE(&start), KEY_OFFSET(&start),
+                KEY_INODE(&buf->last_scanned), KEY_OFFSET(&buf->last_scanned));
+
+       spin_lock(&buf->lock);
+
+       if (!RB_EMPTY_ROOT(&buf->keys)) {
+               struct keybuf_key *w;
+               w = RB_FIRST(&buf->keys, struct keybuf_key, node);
+               buf->start      = START_KEY(&w->key);
+
+               w = RB_LAST(&buf->keys, struct keybuf_key, node);
+               buf->end        = w->key;
+       } else {
+               buf->start      = MAX_KEY;
+               buf->end        = MAX_KEY;
+       }
+
+       spin_unlock(&buf->lock);
+}
+
+static void __bch_keybuf_del(struct keybuf *buf, struct keybuf_key *w)
+{
+       rb_erase(&w->node, &buf->keys);
+       array_free(&buf->freelist, w);
+}
+
+void bch_keybuf_del(struct keybuf *buf, struct keybuf_key *w)
+{
+       spin_lock(&buf->lock);
+       __bch_keybuf_del(buf, w);
+       spin_unlock(&buf->lock);
+}
+
+bool bch_keybuf_check_overlapping(struct keybuf *buf, struct bkey *start,
+                                 struct bkey *end)
+{
+       bool ret = false;
+       struct keybuf_key *p, *w, s;
+       s.key = *start;
+
+       if (bkey_cmp(end, &buf->start) <= 0 ||
+           bkey_cmp(start, &buf->end) >= 0)
+               return false;
+
+       spin_lock(&buf->lock);
+       w = RB_GREATER(&buf->keys, s, node, keybuf_nonoverlapping_cmp);
+
+       while (w && bkey_cmp(&START_KEY(&w->key), end) < 0) {
+               p = w;
+               w = RB_NEXT(w, node);
+
+               if (p->private)
+                       ret = true;
+               else
+                       __bch_keybuf_del(buf, p);
+       }
+
+       spin_unlock(&buf->lock);
+       return ret;
+}
+
+struct keybuf_key *bch_keybuf_next(struct keybuf *buf)
+{
+       struct keybuf_key *w;
+       spin_lock(&buf->lock);
+
+       w = RB_FIRST(&buf->keys, struct keybuf_key, node);
+
+       while (w && w->private)
+               w = RB_NEXT(w, node);
+
+       if (w)
+               w->private = ERR_PTR(-EINTR);
+
+       spin_unlock(&buf->lock);
+       return w;
+}
+
+struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *c,
+                                            struct keybuf *buf,
+                                            struct bkey *end)
+{
+       struct keybuf_key *ret;
+
+       while (1) {
+               ret = bch_keybuf_next(buf);
+               if (ret)
+                       break;
+
+               if (bkey_cmp(&buf->last_scanned, end) >= 0) {
+                       pr_debug("scan finished");
+                       break;
+               }
+
+               bch_refill_keybuf(c, buf, end);
+       }
+
+       return ret;
+}
+
+void bch_keybuf_init(struct keybuf *buf, keybuf_pred_fn *fn)
+{
+       buf->key_predicate      = fn;
+       buf->last_scanned       = MAX_KEY;
+       buf->keys               = RB_ROOT;
+
+       spin_lock_init(&buf->lock);
+       array_allocator_init(&buf->freelist);
+}
+
+void bch_btree_exit(void)
+{
+       if (btree_io_wq)
+               destroy_workqueue(btree_io_wq);
+       if (bch_gc_wq)
+               destroy_workqueue(bch_gc_wq);
+}
+
+int __init bch_btree_init(void)
+{
+       if (!(bch_gc_wq = create_singlethread_workqueue("bch_btree_gc")) ||
+           !(btree_io_wq = create_singlethread_workqueue("bch_btree_io")))
+               return -ENOMEM;
+
+       return 0;
+}
diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h
new file mode 100644 (file)
index 0000000..af4a709
--- /dev/null
@@ -0,0 +1,405 @@
+#ifndef _BCACHE_BTREE_H
+#define _BCACHE_BTREE_H
+
+/*
+ * THE BTREE:
+ *
+ * At a high level, bcache's btree is a relatively standard b+ tree. All keys and
+ * pointers are in the leaves; interior nodes only have pointers to the child
+ * nodes.
+ *
+ * In the interior nodes, a struct bkey always points to a child btree node, and
+ * the key is the highest key in the child node - except that the highest key in
+ * an interior node is always MAX_KEY. The size field refers to the size on disk
+ * of the child node - this would allow us to have variable sized btree nodes
+ * (handy for keeping the depth of the btree 1 by expanding just the root).
+ *
+ * Btree nodes are themselves log structured, but this is hidden fairly
+ * thoroughly. Btree nodes on disk will in practice have extents that overlap
+ * (because they were written at different times), but in memory we never have
+ * overlapping extents - when we read in a btree node from disk, the first thing
+ * we do is resort all the sets of keys with a mergesort, and in the same pass
+ * we check for overlapping extents and adjust them appropriately.
+ *
+ * struct btree_op is a central interface to the btree code. It's used for
+ * specifying read vs. write locking, and the embedded closure is used for
+ * waiting on IO or reserve memory.
+ *
+ * BTREE CACHE:
+ *
+ * Btree nodes are cached in memory; traversing the btree might require reading
+ * in btree nodes which is handled mostly transparently.
+ *
+ * bch_btree_node_get() looks up a btree node in the cache and reads it in from
+ * disk if necessary. This function is almost never called directly though - the
+ * btree() macro is used to get a btree node, call some function on it, and
+ * unlock the node after the function returns.
+ *
+ * The root is special cased - it's taken out of the cache's lru (thus pinning
+ * it in memory), so we can find the root of the btree by just dereferencing a
+ * pointer instead of looking it up in the cache. This makes locking a bit
+ * tricky, since the root pointer is protected by the lock in the btree node it
+ * points to - the btree_root() macro handles this.
+ *
+ * In various places we must be able to allocate memory for multiple btree nodes
+ * in order to make forward progress. To do this we use the btree cache itself
+ * as a reserve; if __get_free_pages() fails, we'll find a node in the btree
+ * cache we can reuse. We can't allow more than one thread to be doing this at a
+ * time, so there's a lock, implemented by a pointer to the btree_op closure -
+ * this allows the btree_root() macro to implicitly release this lock.
+ *
+ * BTREE IO:
+ *
+ * Btree nodes never have to be explicitly read in; bch_btree_node_get() handles
+ * this.
+ *
+ * For writing, we have two btree_write structs embedded in struct btree - one
+ * write in flight, and one being set up, and we toggle between them.
+ *
+ * Writing is done with a single function -  bch_btree_write() really serves two
+ * different purposes and should be broken up into two different functions. When
+ * passing now = false, it merely indicates that the node is now dirty - calling
+ * it ensures that the dirty keys will be written at some point in the future.
+ *
+ * When passing now = true, bch_btree_write() causes a write to happen
+ * "immediately" (if there was already a write in flight, it'll cause the write
+ * to happen as soon as the previous write completes). It returns immediately
+ * though - but it takes a refcount on the closure in struct btree_op you passed
+ * to it, so a closure_sync() later can be used to wait for the write to
+ * complete.
+ *
+ * This is handy because btree_split() and garbage collection can issue writes
+ * in parallel, reducing the amount of time they have to hold write locks.
+ *
+ * LOCKING:
+ *
+ * When traversing the btree, we may need write locks starting at some level -
+ * inserting a key into the btree will typically only require a write lock on
+ * the leaf node.
+ *
+ * This is specified with the lock field in struct btree_op; lock = 0 means we
+ * take write locks at level <= 0, i.e. only leaf nodes. bch_btree_node_get()
+ * checks this field and returns the node with the appropriate lock held.
+ *
+ * If, after traversing the btree, the insertion code discovers it has to split
+ * then it must restart from the root and take new locks - to do this it changes
+ * the lock field and returns -EINTR, which causes the btree_root() macro to
+ * loop.
+ *
+ * Handling cache misses requires a different mechanism for upgrading to a write
+ * lock. We do cache lookups with only a read lock held, but if we get a cache
+ * miss and we wish to insert this data into the cache, we have to insert a
+ * placeholder key to detect races - otherwise, we could race with a write and
+ * overwrite the data that was just written to the cache with stale data from
+ * the backing device.
+ *
+ * For this we use a sequence number that write locks and unlocks increment - to
+ * insert the check key it unlocks the btree node and then takes a write lock,
+ * and fails if the sequence number doesn't match.
+ */
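Editorial aside, not part of the patch: a minimal sketch of the two bch_btree_write() modes described above. It assumes the caller already holds b write-locked and has a btree_op set up with bch_btree_op_init_stack(); the function name example_write_modes is made up for illustration.

static void example_write_modes(struct btree *b, struct btree_op *op)
{
	/* now == false: merely marks the node dirty; its keys reach disk later. */
	bch_btree_write(b, false, op);

	/* now == true: starts a write immediately and takes a ref on op->cl... */
	bch_btree_write(b, true, op);

	/* ...so a later closure_sync() waits for that write to complete. */
	closure_sync(&op->cl);
}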
+
+#include "bset.h"
+#include "debug.h"
+
+struct btree_write {
+       struct closure          *owner;
+       atomic_t                *journal;
+
+       /* If btree_split() frees a btree node, it writes a new pointer to that
+        * btree node indicating it was freed; it takes a refcount on
+        * c->prio_blocked because we can't write the gens until the new
+        * pointer is on disk. This allows btree_write_endio() to release the
+        * refcount that btree_split() took.
+        */
+       int                     prio_blocked;
+};
+
+struct btree {
+       /* Hottest entries first */
+       struct hlist_node       hash;
+
+       /* Key/pointer for this btree node */
+       BKEY_PADDED(key);
+
+       /* Single bit - set when accessed, cleared by shrinker */
+       unsigned long           accessed;
+       unsigned long           seq;
+       struct rw_semaphore     lock;
+       struct cache_set        *c;
+
+       unsigned long           flags;
+       uint16_t                written;        /* would be nice to kill */
+       uint8_t                 level;
+       uint8_t                 nsets;
+       uint8_t                 page_order;
+
+       /*
+        * Set of sorted keys - the real btree node - plus a binary search tree
+        *
+        * sets[0] is special; set[0]->tree, set[0]->prev and set[0]->data point
+        * to the memory we have allocated for this btree node. Additionally,
+        * set[0]->data points to the entire btree node as it exists on disk.
+        */
+       struct bset_tree        sets[MAX_BSETS];
+
+       /* Used to refcount bio splits, also protects b->bio */
+       struct closure_with_waitlist    io;
+
+       /* Gets transferred to w->prio_blocked - see the comment there */
+       int                     prio_blocked;
+
+       struct list_head        list;
+       struct delayed_work     work;
+
+       uint64_t                io_start_time;
+       struct btree_write      writes[2];
+       struct bio              *bio;
+};
+
+#define BTREE_FLAG(flag)                                               \
+static inline bool btree_node_ ## flag(struct btree *b)                        \
+{      return test_bit(BTREE_NODE_ ## flag, &b->flags); }              \
+                                                                       \
+static inline void set_btree_node_ ## flag(struct btree *b)            \
+{      set_bit(BTREE_NODE_ ## flag, &b->flags); }                      \
+
+enum btree_flags {
+       BTREE_NODE_read_done,
+       BTREE_NODE_io_error,
+       BTREE_NODE_dirty,
+       BTREE_NODE_write_idx,
+};
+
+BTREE_FLAG(read_done);
+BTREE_FLAG(io_error);
+BTREE_FLAG(dirty);
+BTREE_FLAG(write_idx);
+
+static inline struct btree_write *btree_current_write(struct btree *b)
+{
+       return b->writes + btree_node_write_idx(b);
+}
+
+static inline struct btree_write *btree_prev_write(struct btree *b)
+{
+       return b->writes + (btree_node_write_idx(b) ^ 1);
+}
+
+static inline unsigned bset_offset(struct btree *b, struct bset *i)
+{
+       return (((size_t) i) - ((size_t) b->sets->data)) >> 9;
+}
+
+static inline struct bset *write_block(struct btree *b)
+{
+       return ((void *) b->sets[0].data) + b->written * block_bytes(b->c);
+}
+
+static inline bool bset_written(struct btree *b, struct bset_tree *t)
+{
+       return t->data < write_block(b);
+}
+
+static inline bool bkey_written(struct btree *b, struct bkey *k)
+{
+       return k < write_block(b)->start;
+}
+
+static inline void set_gc_sectors(struct cache_set *c)
+{
+       atomic_set(&c->sectors_to_gc, c->sb.bucket_size * c->nbuckets / 8);
+}
+
+static inline bool bch_ptr_invalid(struct btree *b, const struct bkey *k)
+{
+       return __bch_ptr_invalid(b->c, b->level, k);
+}
+
+static inline struct bkey *bch_btree_iter_init(struct btree *b,
+                                              struct btree_iter *iter,
+                                              struct bkey *search)
+{
+       return __bch_btree_iter_init(b, iter, search, b->sets);
+}
+
+/* Looping macros */
+
+#define for_each_cached_btree(b, c, iter)                              \
+       for (iter = 0;                                                  \
+            iter < ARRAY_SIZE((c)->bucket_hash);                       \
+            iter++)                                                    \
+               hlist_for_each_entry_rcu((b), (c)->bucket_hash + iter, hash)
+
+#define for_each_key_filter(b, k, iter, filter)                                \
+       for (bch_btree_iter_init((b), (iter), NULL);                    \
+            ((k) = bch_btree_iter_next_filter((iter), b, filter));)
+
+#define for_each_key(b, k, iter)                                       \
+       for (bch_btree_iter_init((b), (iter), NULL);                    \
+            ((k) = bch_btree_iter_next(iter));)
+
+/* Recursing down the btree */
+
+struct btree_op {
+       struct closure          cl;
+       struct cache_set        *c;
+
+       /* Journal entry we have a refcount on */
+       atomic_t                *journal;
+
+       /* Bio to be inserted into the cache */
+       struct bio              *cache_bio;
+
+       unsigned                inode;
+
+       uint16_t                write_prio;
+
+       /* Btree level at which we start taking write locks */
+       short                   lock;
+
+       /* Btree insertion type */
+       enum {
+               BTREE_INSERT,
+               BTREE_REPLACE
+       } type:8;
+
+       unsigned                csum:1;
+       unsigned                skip:1;
+       unsigned                flush_journal:1;
+
+       unsigned                insert_data_done:1;
+       unsigned                lookup_done:1;
+       unsigned                insert_collision:1;
+
+       /* Anything after this point won't get zeroed in do_bio_hook() */
+
+       /* Keys to be inserted */
+       struct keylist          keys;
+       BKEY_PADDED(replace);
+};
+
+void bch_btree_op_init_stack(struct btree_op *);
+
+static inline void rw_lock(bool w, struct btree *b, int level)
+{
+       w ? down_write_nested(&b->lock, level + 1)
+         : down_read_nested(&b->lock, level + 1);
+       if (w)
+               b->seq++;
+}
+
+static inline void rw_unlock(bool w, struct btree *b)
+{
+#ifdef CONFIG_BCACHE_EDEBUG
+       unsigned i;
+
+       if (w &&
+           b->key.ptr[0] &&
+           btree_node_read_done(b))
+               for (i = 0; i <= b->nsets; i++)
+                       bch_check_key_order(b, b->sets[i].data);
+#endif
+
+       if (w)
+               b->seq++;
+       (w ? up_write : up_read)(&b->lock);
+}
+
+#define insert_lock(s, b)      ((b)->level <= (s)->lock)
+
+/*
+ * These macros are for recursing down the btree - they handle the details of
+ * locking and looking up nodes in the cache for you. They're best treated as
+ * mere syntax when reading code that uses them.
+ *
+ * op->lock determines whether we take a read or a write lock at a given depth.
+ * If you've got a read lock and find that you need a write lock (i.e. you're
+ * going to have to split), set op->lock and return -EINTR; btree_root() will
+ * call you again and you'll have the correct lock.
+ */
+
+/**
+ * btree - recurse down the btree on a specified key
+ * @fn:                function to call, which will be passed the child node
+ * @key:       key to recurse on
+ * @b:         parent btree node
+ * @op:                pointer to struct btree_op
+ */
+#define btree(fn, key, b, op, ...)                                     \
+({                                                                     \
+       int _r, l = (b)->level - 1;                                     \
+       bool _w = l <= (op)->lock;                                      \
+       struct btree *_b = bch_btree_node_get((b)->c, key, l, op);      \
+       if (!IS_ERR(_b)) {                                              \
+               _r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__);           \
+               rw_unlock(_w, _b);                                      \
+       } else                                                          \
+               _r = PTR_ERR(_b);                                       \
+       _r;                                                             \
+})
+
+/**
+ * btree_root - call a function on the root of the btree
+ * @fn:                function to call, which will be passed the root node
+ * @c:         cache set
+ * @op:                pointer to struct btree_op
+ */
+#define btree_root(fn, c, op, ...)                                     \
+({                                                                     \
+       int _r = -EINTR;                                                \
+       do {                                                            \
+               struct btree *_b = (c)->root;                           \
+               bool _w = insert_lock(op, _b);                          \
+               rw_lock(_w, _b, _b->level);                             \
+               if (_b == (c)->root &&                                  \
+                   _w == insert_lock(op, _b))                          \
+                       _r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__);   \
+               rw_unlock(_w, _b);                                      \
+               bch_cannibalize_unlock(c, &(op)->cl);           \
+       } while (_r == -EINTR);                                         \
+                                                                       \
+       _r;                                                             \
+})
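Editorial aside, not part of the patch: a sketch of how a recursion function cooperates with the -EINTR retry loop in btree_root(), modeled on bch_btree_insert_recurse() earlier in this patch. The names bch_btree_example_recurse / example_recurse are hypothetical; should_split(), op->lock, bch_next_recurse_key() and the btree()/btree_root() macros come from the code above.

static int bch_btree_example_recurse(struct btree *b, struct btree_op *op)
{
	struct bkey *k;

	if (should_split(b) && op->lock <= b->c->root->level) {
		/* Not holding write locks high enough for a split that might
		 * cascade upwards: bump op->lock and return -EINTR so that
		 * btree_root() restarts the traversal from the root. */
		op->lock = b->c->root->level + 1;
		return -EINTR;
	}

	if (!b->level)
		return 0;	/* leaf node: the real work would happen here */

	/* Recurse into the child covering the first key of interest; btree()
	 * takes a read or write lock on it according to op->lock. */
	k = bch_next_recurse_key(b, &START_KEY(op->keys.bottom));
	return k ? btree(example_recurse, k, b, op) : -EIO;
}

Callers would start at the root, e.g. ret = btree_root(example_recurse, c, op); and btree_root() keeps retrying for as long as -EINTR is returned.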
+
+static inline bool should_split(struct btree *b)
+{
+       struct bset *i = write_block(b);
+       return b->written >= btree_blocks(b) ||
+               (i->seq == b->sets[0].data->seq &&
+                b->written + __set_blocks(i, i->keys + 15, b->c)
+                > btree_blocks(b));
+}
+
+void bch_btree_read_done(struct closure *);
+void bch_btree_read(struct btree *);
+void bch_btree_write(struct btree *b, bool now, struct btree_op *op);
+
+void bch_cannibalize_unlock(struct cache_set *, struct closure *);
+void bch_btree_set_root(struct btree *);
+struct btree *bch_btree_node_alloc(struct cache_set *, int, struct closure *);
+struct btree *bch_btree_node_get(struct cache_set *, struct bkey *,
+                               int, struct btree_op *);
+
+bool bch_btree_insert_keys(struct btree *, struct btree_op *);
+bool bch_btree_insert_check_key(struct btree *, struct btree_op *,
+                                  struct bio *);
+int bch_btree_insert(struct btree_op *, struct cache_set *);
+
+int bch_btree_search_recurse(struct btree *, struct btree_op *);
+
+void bch_queue_gc(struct cache_set *);
+size_t bch_btree_gc_finish(struct cache_set *);
+void bch_moving_gc(struct closure *);
+int bch_btree_check(struct cache_set *, struct btree_op *);
+uint8_t __bch_btree_mark_key(struct cache_set *, int, struct bkey *);
+
+void bch_keybuf_init(struct keybuf *, keybuf_pred_fn *);
+void bch_refill_keybuf(struct cache_set *, struct keybuf *, struct bkey *);
+bool bch_keybuf_check_overlapping(struct keybuf *, struct bkey *,
+                                 struct bkey *);
+void bch_keybuf_del(struct keybuf *, struct keybuf_key *);
+struct keybuf_key *bch_keybuf_next(struct keybuf *);
+struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *,
+                                         struct keybuf *, struct bkey *);
+
+#endif
diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c
new file mode 100644 (file)
index 0000000..bd05a9a
--- /dev/null
@@ -0,0 +1,345 @@
+/*
+ * Asynchronous refcounty things
+ *
+ * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
+ * Copyright 2012 Google, Inc.
+ */
+
+#include <linux/debugfs.h>
+#include <linux/module.h>
+#include <linux/seq_file.h>
+
+#include "closure.h"
+
+void closure_queue(struct closure *cl)
+{
+       struct workqueue_struct *wq = cl->wq;
+       if (wq) {
+               INIT_WORK(&cl->work, cl->work.func);
+               BUG_ON(!queue_work(wq, &cl->work));
+       } else
+               cl->fn(cl);
+}
+EXPORT_SYMBOL_GPL(closure_queue);
+
+#define CL_FIELD(type, field)                                  \
+       case TYPE_ ## type:                                     \
+       return &container_of(cl, struct type, cl)->field
+
+static struct closure_waitlist *closure_waitlist(struct closure *cl)
+{
+       switch (cl->type) {
+               CL_FIELD(closure_with_waitlist, wait);
+               CL_FIELD(closure_with_waitlist_and_timer, wait);
+       default:
+               return NULL;
+       }
+}
+
+static struct timer_list *closure_timer(struct closure *cl)
+{
+       switch (cl->type) {
+               CL_FIELD(closure_with_timer, timer);
+               CL_FIELD(closure_with_waitlist_and_timer, timer);
+       default:
+               return NULL;
+       }
+}
+
+static inline void closure_put_after_sub(struct closure *cl, int flags)
+{
+       int r = flags & CLOSURE_REMAINING_MASK;
+
+       BUG_ON(flags & CLOSURE_GUARD_MASK);
+       BUG_ON(!r && (flags & ~(CLOSURE_DESTRUCTOR|CLOSURE_BLOCKING)));
+
+       /* Must deliver precisely one wakeup */
+       if (r == 1 && (flags & CLOSURE_SLEEPING))
+               wake_up_process(cl->task);
+
+       if (!r) {
+               if (cl->fn && !(flags & CLOSURE_DESTRUCTOR)) {
+                       /* CLOSURE_BLOCKING might be set - clear it */
+                       atomic_set(&cl->remaining,
+                                  CLOSURE_REMAINING_INITIALIZER);
+                       closure_queue(cl);
+               } else {
+                       struct closure *parent = cl->parent;
+                       struct closure_waitlist *wait = closure_waitlist(cl);
+
+                       closure_debug_destroy(cl);
+
+                       atomic_set(&cl->remaining, -1);
+
+                       if (wait)
+                               closure_wake_up(wait);
+
+                       if (cl->fn)
+                               cl->fn(cl);
+
+                       if (parent)
+                               closure_put(parent);
+               }
+       }
+}
+
+/* For clearing flags with the same atomic op as a put */
+void closure_sub(struct closure *cl, int v)
+{
+       closure_put_after_sub(cl, atomic_sub_return(v, &cl->remaining));
+}
+EXPORT_SYMBOL_GPL(closure_sub);
+
+void closure_put(struct closure *cl)
+{
+       closure_put_after_sub(cl, atomic_dec_return(&cl->remaining));
+}
+EXPORT_SYMBOL_GPL(closure_put);
+
+static void set_waiting(struct closure *cl, unsigned long f)
+{
+#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
+       cl->waiting_on = f;
+#endif
+}
+
+void __closure_wake_up(struct closure_waitlist *wait_list)
+{
+       struct llist_node *list;
+       struct closure *cl;
+       struct llist_node *reverse = NULL;
+
+       list = llist_del_all(&wait_list->list);
+
+       /* We first reverse the list to preserve FIFO ordering and fairness */
+
+       while (list) {
+               struct llist_node *t = list;
+               list = llist_next(list);
+
+               t->next = reverse;
+               reverse = t;
+       }
+
+       /* Then do the wakeups */
+
+       while (reverse) {
+               cl = container_of(reverse, struct closure, list);
+               reverse = llist_next(reverse);
+
+               set_waiting(cl, 0);
+               closure_sub(cl, CLOSURE_WAITING + 1);
+       }
+}
+EXPORT_SYMBOL_GPL(__closure_wake_up);
+
+bool closure_wait(struct closure_waitlist *list, struct closure *cl)
+{
+       if (atomic_read(&cl->remaining) & CLOSURE_WAITING)
+               return false;
+
+       set_waiting(cl, _RET_IP_);
+       atomic_add(CLOSURE_WAITING + 1, &cl->remaining);
+       llist_add(&cl->list, &list->list);
+
+       return true;
+}
+EXPORT_SYMBOL_GPL(closure_wait);
+
+/**
+ * closure_sync() - sleep until a closure has nothing left to wait on
+ *
+ * Sleeps until the refcount hits 1 - the thread that's running the closure owns
+ * the last refcount.
+ */
+void closure_sync(struct closure *cl)
+{
+       while (1) {
+               __closure_start_sleep(cl);
+               closure_set_ret_ip(cl);
+
+               if ((atomic_read(&cl->remaining) &
+                    CLOSURE_REMAINING_MASK) == 1)
+                       break;
+
+               schedule();
+       }
+
+       __closure_end_sleep(cl);
+}
+EXPORT_SYMBOL_GPL(closure_sync);
+
+/**
+ * closure_trylock() - try to acquire the closure, without waiting
+ * @cl:                closure to lock
+ *
+ * Returns true if the closure was successfully locked.
+ */
+bool closure_trylock(struct closure *cl, struct closure *parent)
+{
+       if (atomic_cmpxchg(&cl->remaining, -1,
+                          CLOSURE_REMAINING_INITIALIZER) != -1)
+               return false;
+
+       closure_set_ret_ip(cl);
+
+       smp_mb();
+       cl->parent = parent;
+       if (parent)
+               closure_get(parent);
+
+       closure_debug_create(cl);
+       return true;
+}
+EXPORT_SYMBOL_GPL(closure_trylock);
+
+void __closure_lock(struct closure *cl, struct closure *parent,
+                   struct closure_waitlist *wait_list)
+{
+       struct closure wait;
+       closure_init_stack(&wait);
+
+       while (1) {
+               if (closure_trylock(cl, parent))
+                       return;
+
+               closure_wait_event_sync(wait_list, &wait,
+                                       atomic_read(&cl->remaining) == -1);
+       }
+}
+EXPORT_SYMBOL_GPL(__closure_lock);
+
+static void closure_delay_timer_fn(unsigned long data)
+{
+       struct closure *cl = (struct closure *) data;
+       closure_sub(cl, CLOSURE_TIMER + 1);
+}
+
+void do_closure_timer_init(struct closure *cl)
+{
+       struct timer_list *timer = closure_timer(cl);
+
+       init_timer(timer);
+       timer->data     = (unsigned long) cl;
+       timer->function = closure_delay_timer_fn;
+}
+EXPORT_SYMBOL_GPL(do_closure_timer_init);
+
+bool __closure_delay(struct closure *cl, unsigned long delay,
+                    struct timer_list *timer)
+{
+       if (atomic_read(&cl->remaining) & CLOSURE_TIMER)
+               return false;
+
+       BUG_ON(timer_pending(timer));
+
+       timer->expires  = jiffies + delay;
+
+       atomic_add(CLOSURE_TIMER + 1, &cl->remaining);
+       add_timer(timer);
+       return true;
+}
+EXPORT_SYMBOL_GPL(__closure_delay);
+
+void __closure_flush(struct closure *cl, struct timer_list *timer)
+{
+       if (del_timer(timer))
+               closure_sub(cl, CLOSURE_TIMER + 1);
+}
+EXPORT_SYMBOL_GPL(__closure_flush);
+
+void __closure_flush_sync(struct closure *cl, struct timer_list *timer)
+{
+       if (del_timer_sync(timer))
+               closure_sub(cl, CLOSURE_TIMER + 1);
+}
+EXPORT_SYMBOL_GPL(__closure_flush_sync);
+
+#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
+
+static LIST_HEAD(closure_list);
+static DEFINE_SPINLOCK(closure_list_lock);
+
+void closure_debug_create(struct closure *cl)
+{
+       unsigned long flags;
+
+       BUG_ON(cl->magic == CLOSURE_MAGIC_ALIVE);
+       cl->magic = CLOSURE_MAGIC_ALIVE;
+
+       spin_lock_irqsave(&closure_list_lock, flags);
+       list_add(&cl->all, &closure_list);
+       spin_unlock_irqrestore(&closure_list_lock, flags);
+}
+EXPORT_SYMBOL_GPL(closure_debug_create);
+
+void closure_debug_destroy(struct closure *cl)
+{
+       unsigned long flags;
+
+       BUG_ON(cl->magic != CLOSURE_MAGIC_ALIVE);
+       cl->magic = CLOSURE_MAGIC_DEAD;
+
+       spin_lock_irqsave(&closure_list_lock, flags);
+       list_del(&cl->all);
+       spin_unlock_irqrestore(&closure_list_lock, flags);
+}
+EXPORT_SYMBOL_GPL(closure_debug_destroy);
+
+static struct dentry *debug;
+
+#define work_data_bits(work) ((unsigned long *)(&(work)->data))
+
+static int debug_seq_show(struct seq_file *f, void *data)
+{
+       struct closure *cl;
+       spin_lock_irq(&closure_list_lock);
+
+       list_for_each_entry(cl, &closure_list, all) {
+               int r = atomic_read(&cl->remaining);
+
+               seq_printf(f, "%p: %pF -> %pf p %p r %i ",
+                          cl, (void *) cl->ip, cl->fn, cl->parent,
+                          r & CLOSURE_REMAINING_MASK);
+
+               seq_printf(f, "%s%s%s%s%s%s\n",
+                          test_bit(WORK_STRUCT_PENDING,
+                                   work_data_bits(&cl->work)) ? "Q" : "",
+                          r & CLOSURE_RUNNING  ? "R" : "",
+                          r & CLOSURE_BLOCKING ? "B" : "",
+                          r & CLOSURE_STACK    ? "S" : "",
+                          r & CLOSURE_SLEEPING ? "Sl" : "",
+                          r & CLOSURE_TIMER    ? "T" : "");
+
+               if (r & CLOSURE_WAITING)
+                       seq_printf(f, " W %pF\n",
+                                  (void *) cl->waiting_on);
+
+               seq_printf(f, "\n");
+       }
+
+       spin_unlock_irq(&closure_list_lock);
+       return 0;
+}
+
+static int debug_seq_open(struct inode *inode, struct file *file)
+{
+       return single_open(file, debug_seq_show, NULL);
+}
+
+static const struct file_operations debug_ops = {
+       .owner          = THIS_MODULE,
+       .open           = debug_seq_open,
+       .read           = seq_read,
+       .release        = single_release
+};
+
+void __init closure_debug_init(void)
+{
+       debug = debugfs_create_file("closures", 0400, NULL, NULL, &debug_ops);
+}
+
+#endif
+
+MODULE_AUTHOR("Kent Overstreet <koverstreet@google.com>");
+MODULE_LICENSE("GPL");
diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h
new file mode 100644 (file)
index 0000000..0003992
--- /dev/null
@@ -0,0 +1,672 @@
+#ifndef _LINUX_CLOSURE_H
+#define _LINUX_CLOSURE_H
+
+#include <linux/llist.h>
+#include <linux/sched.h>
+#include <linux/workqueue.h>
+
+/*
+ * Closure is perhaps the most overused and abused term in computer science, but
+ * since I've been unable to come up with anything better you're stuck with it
+ * again.
+ *
+ * What are closures?
+ *
+ * They embed a refcount. The basic idea is they count "things that are in
+ * progress" - in flight bios, some other thread that's doing something else -
+ * anything you might want to wait on.
+ *
+ * The refcount may be manipulated with closure_get() and closure_put().
+ * closure_put() is where many of the interesting things happen, when it causes
+ * the refcount to go to 0.
+ *
+ * Closures can be used to wait on things both synchronously and asynchronously,
+ * and synchronous and asynchronous use can be mixed without restriction. To
+ * wait synchronously, use closure_sync() - you will sleep until your closure's
+ * refcount hits 1.
+ *
+ * To wait asynchronously, use
+ *   continue_at(cl, next_function, workqueue);
+ *
+ * passing it, as you might expect, the function to run when nothing is pending
+ * and the workqueue to run that function out of.
+ *
+ * continue_at() also, critically, is a macro that returns the calling function.
+ * There's good reason for this.
+ *
+ * To safely use closures asynchronously, they must always have a refcount, owned
+ * by the thread that is running them, while they are running. Otherwise, suppose
+ * you submit some bios and wish to have a function run when they all complete:
+ *
+ * foo_endio(struct bio *bio, int error)
+ * {
+ *     closure_put(cl);
+ * }
+ *
+ * closure_init(cl);
+ *
+ * do_stuff();
+ * closure_get(cl);
+ * bio1->bi_endio = foo_endio;
+ * bio_submit(bio1);
+ *
+ * do_more_stuff();
+ * closure_get(cl);
+ * bio2->bi_endio = foo_endio;
+ * bio_submit(bio2);
+ *
+ * continue_at(cl, complete_some_read, system_wq);
+ *
+ * If the closure's refcount started at 0, complete_some_read() could run before the
+ * second bio was submitted - which is almost always not what you want! More
+ * importantly, it wouldn't be possible to say whether the original thread or
+ * complete_some_read()'s thread owned the closure - and whatever state it was
+ * associated with!
+ *
+ * So, closure_init() initializes a closure's refcount to 1 - and when a
+ * closure_fn is run, the refcount will be reset to 1 first.
+ *
+ * Then, the rule is - if you got the refcount with closure_get(), release it
+ * with closure_put() (i.e, in a bio->bi_endio function). If you have a refcount
+ * on a closure because you called closure_init() or you were run out of a
+ * closure - _always_ use continue_at(). Doing so consistently will help
+ * eliminate an entire class of particularly pernicious races.
+ *
+ * For a closure to wait on an arbitrary event, we need to introduce waitlists:
+ *
+ * struct closure_waitlist list;
+ * closure_wait_event(list, cl, condition);
+ * closure_wake_up(wait_list);
+ *
+ * These work analogously to wait_event() and wake_up() - except that instead of
+ * operating on the current thread (for wait_event()) and lists of threads, they
+ * operate on an explicit closure and lists of closures.
+ *
+ * Because it's a closure we can now wait either synchronously or
+ * asynchronously. closure_wait_event() returns the current value of the
+ * condition, and if it returned false continue_at() or closure_sync() can be
+ * used to wait for it to become true.
+ *
+ * It's useful for waiting on things when you can't sleep in the context in
+ * which you must check the condition (perhaps with a spinlock held, or you might be
+ * beneath generic_make_request() - in which case you can't sleep on IO).
+ *
+ * closure_wait_event() will wait either synchronously or asynchronously,
+ * depending on whether the closure is in blocking mode or not. You can pick a
+ * mode explicitly with closure_wait_event_sync() and
+ * closure_wait_event_async(), which do just what you might expect.
+ *
+ * Lastly, you might have a wait list dedicated to a specific event, and have no
+ * need for specifying the condition - you just want to wait until someone runs
+ * closure_wake_up() on the appropriate wait list. In that case, just use
+ * closure_wait(). It will return either true or false, depending on whether the
+ * closure was already on a wait list or not - a closure can only be on one wait
+ * list at a time.
+ *
+ * Parents:
+ *
+ * closure_init() takes two arguments - it takes the closure to initialize, and
+ * a (possibly null) parent.
+ *
+ * If parent is non null, the new closure will have a refcount for its lifetime;
+ * a closure is considered to be "finished" when its refcount hits 0 and the
+ * function to run is null. Hence
+ *
+ * continue_at(cl, NULL, NULL);
+ *
+ * returns up the (spaghetti) stack of closures, precisely like normal return
+ * returns up the C stack. continue_at() with non null fn is better thought of
+ * as doing a tail call.
+ *
+ * All this implies that a closure should typically be embedded in a particular
+ * struct (which its refcount will normally control the lifetime of), and that
+ * struct can very much be thought of as a stack frame.
+ *
+ * Locking:
+ *
+ * Closures are based on work items but they can be thought of as more like
+ * threads - in that like threads and unlike work items they have a well
+ * defined lifetime; they are created (with closure_init()) and eventually
+ * complete after a continue_at(cl, NULL, NULL).
+ *
+ * Suppose you've got some larger structure with a closure embedded in it that's
+ * used for periodically doing garbage collection. You only want one garbage
+ * collection happening at a time, so the natural thing to do is protect it with
+ * a lock. However, it's difficult to use a lock protecting a closure correctly
+ * because the unlock should come after the last continue_at() (additionally, if
+ * you're using the closure asynchronously a mutex won't work since a mutex has
+ * to be unlocked by the same process that locked it).
+ *
+ * So to make it less error prone and more efficient, we also have the ability
+ * to use closures as locks:
+ *
+ * closure_init_unlocked();
+ * closure_trylock();
+ *
+ * That's all we need for trylock() - the last closure_put() implicitly unlocks
+ * it for you.  But for closure_lock(), we also need a wait list:
+ *
+ * struct closure_with_waitlist frobnicator_cl;
+ *
+ * closure_init_unlocked(&frobnicator_cl);
+ * closure_lock(&frobnicator_cl);
+ *
+ * A closure_with_waitlist embeds a closure and a wait list - much like struct
+ * delayed_work embeds a work item and a timer_list. The important thing is, use
+ * it exactly like you would a regular closure and closure_put() will magically
+ * handle everything for you.
+ *
+ * We've got closures that embed timers, too. They're called, appropriately
+ * enough:
+ * struct closure_with_timer;
+ *
+ * This gives you access to closure_delay(). It takes a refcount for a specified
+ * number of jiffies - you could then call closure_sync() (for a slightly
+ * convoluted version of msleep()) or continue_at() - which gives you the same
+ * effect as using a delayed work item, except you can reuse the work_struct
+ * already embedded in struct closure.
+ *
+ * Lastly, there's struct closure_with_waitlist_and_timer. It does what you
+ * probably expect, if you happen to need the features of both. (You don't
+ * really want to know how all this is implemented, but if I've done my job
+ * right you shouldn't have to care).
+ */
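Editorial aside, not part of the patch: the refcounting pattern from the foo_endio example above, pulled together into one self-contained sketch. The struct and function names (example_request, example_submit, example_endio, example_complete) are hypothetical; closure_init(), closure_get(), closure_put() and continue_at() are the primitives documented above, and submit_bio()/system_wq are the usual block-layer and workqueue symbols of this kernel generation.

struct example_request {
	struct closure	cl;
	struct bio	*bio[2];
};

static void example_complete(struct closure *cl)
{
	struct example_request *req = container_of(cl, struct example_request, cl);

	pr_debug("both bios for %p have completed", req);
}

static void example_endio(struct bio *bio, int error)
{
	struct closure *cl = bio->bi_private;

	closure_put(cl);	/* drop the ref taken before submission */
}

static void example_submit(struct example_request *req)
{
	unsigned i;

	closure_init(&req->cl, NULL);	/* refcount starts at 1, owned by us */

	for (i = 0; i < 2; i++) {
		closure_get(&req->cl);	/* one ref per in-flight bio */
		req->bio[i]->bi_end_io	= example_endio;
		req->bio[i]->bi_private	= &req->cl;
		submit_bio(READ, req->bio[i]);
	}

	/* Drops our ref and runs example_complete() out of system_wq once
	 * both bios have called example_endio(). */
	continue_at(&req->cl, example_complete, system_wq);
}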
+
+struct closure;
+typedef void (closure_fn) (struct closure *);
+
+struct closure_waitlist {
+       struct llist_head       list;
+};
+
+enum closure_type {
+       TYPE_closure                            = 0,
+       TYPE_closure_with_waitlist              = 1,
+       TYPE_closure_with_timer                 = 2,
+       TYPE_closure_with_waitlist_and_timer    = 3,
+       MAX_CLOSURE_TYPE                        = 3,
+};
+
+enum closure_state {
+       /*
+        * CLOSURE_BLOCKING: Causes closure_wait_event() to block, instead of
+        * waiting asynchronously
+        *
+        * CLOSURE_WAITING: Set iff the closure is on a waitlist. Must be set by
+        * the thread that owns the closure, and cleared by the thread that's
+        * waking up the closure.
+        *
+        * CLOSURE_SLEEPING: Must be set before a thread uses a closure to sleep
+        * - indicates that cl->task is valid and closure_put() may wake it up.
+        * Only set or cleared by the thread that owns the closure.
+        *
+        * CLOSURE_TIMER: Analogous to CLOSURE_WAITING, indicates that a closure
+        * has an outstanding timer. Must be set by the thread that owns the
+        * closure, and cleared by the timer function when the timer goes off.
+        *
+        * The rest are for debugging and don't affect behaviour:
+        *
+        * CLOSURE_RUNNING: Set when a closure is running (i.e. by
+        * closure_init() and when closure_put() runs the next function), and
+        * must be cleared before remaining hits 0. Primarily to help guard
+        * against incorrect usage and accidentally transferring references.
+        * continue_at() and closure_return() clear it for you, if you're doing
+        * something unusual you can use closure_set_dead() which also helps
+        * annotate where references are being transferred.
+        *
+        * CLOSURE_STACK: Sanity check - remaining should never hit 0 on a
+        * closure with this flag set
+        */
+
+       CLOSURE_BITS_START      = (1 << 19),
+       CLOSURE_DESTRUCTOR      = (1 << 19),
+       CLOSURE_BLOCKING        = (1 << 21),
+       CLOSURE_WAITING         = (1 << 23),
+       CLOSURE_SLEEPING        = (1 << 25),
+       CLOSURE_TIMER           = (1 << 27),
+       CLOSURE_RUNNING         = (1 << 29),
+       CLOSURE_STACK           = (1 << 31),
+};
+
+#define CLOSURE_GUARD_MASK                                     \
+       ((CLOSURE_DESTRUCTOR|CLOSURE_BLOCKING|CLOSURE_WAITING|  \
+         CLOSURE_SLEEPING|CLOSURE_TIMER|CLOSURE_RUNNING|CLOSURE_STACK) << 1)
+
+#define CLOSURE_REMAINING_MASK         (CLOSURE_BITS_START - 1)
+#define CLOSURE_REMAINING_INITIALIZER  (1|CLOSURE_RUNNING)
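For reference, remaining packs a reference count into the low 19 bits and the state flags above it, with an unused bit left between consecutive flags so debug builds can catch over/underflow via CLOSURE_GUARD_MASK. A small editorial sketch of pulling the pieces apart:

        int r = atomic_read(&cl->remaining);

        unsigned refs    = r & CLOSURE_REMAINING_MASK;  /* outstanding references    */
        bool     running = r & CLOSURE_RUNNING;         /* closure currently running */
        bool     waiting = r & CLOSURE_WAITING;         /* closure is on a waitlist  */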
+
+struct closure {
+       union {
+               struct {
+                       struct workqueue_struct *wq;
+                       struct task_struct      *task;
+                       struct llist_node       list;
+                       closure_fn              *fn;
+               };
+               struct work_struct      work;
+       };
+
+       struct closure          *parent;
+
+       atomic_t                remaining;
+
+       enum closure_type       type;
+
+#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
+#define CLOSURE_MAGIC_DEAD     0xc054dead
+#define CLOSURE_MAGIC_ALIVE    0xc054a11e
+
+       unsigned                magic;
+       struct list_head        all;
+       unsigned long           ip;
+       unsigned long           waiting_on;
+#endif
+};
+
+struct closure_with_waitlist {
+       struct closure          cl;
+       struct closure_waitlist wait;
+};
+
+struct closure_with_timer {
+       struct closure          cl;
+       struct timer_list       timer;
+};
+
+struct closure_with_waitlist_and_timer {
+       struct closure          cl;
+       struct closure_waitlist wait;
+       struct timer_list       timer;
+};
+
+extern unsigned invalid_closure_type(void);
+
+#define __CLOSURE_TYPE(cl, _t)                                         \
+         __builtin_types_compatible_p(typeof(cl), struct _t)           \
+               ? TYPE_ ## _t :                                         \
+
+#define __closure_type(cl)                                             \
+(                                                                      \
+       __CLOSURE_TYPE(cl, closure)                                     \
+       __CLOSURE_TYPE(cl, closure_with_waitlist)                       \
+       __CLOSURE_TYPE(cl, closure_with_timer)                          \
+       __CLOSURE_TYPE(cl, closure_with_waitlist_and_timer)             \
+       invalid_closure_type()                                          \
+)
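To make the chained ternary easier to read, this is roughly what __closure_type(x) expands to for a hypothetical "struct closure_with_timer x;" (editorial illustration):

        __builtin_types_compatible_p(typeof(x), struct closure)
                ? TYPE_closure :
        __builtin_types_compatible_p(typeof(x), struct closure_with_waitlist)
                ? TYPE_closure_with_waitlist :
        __builtin_types_compatible_p(typeof(x), struct closure_with_timer)
                ? TYPE_closure_with_timer :
        __builtin_types_compatible_p(typeof(x), struct closure_with_waitlist_and_timer)
                ? TYPE_closure_with_waitlist_and_timer :
        invalid_closure_type()

Because __builtin_types_compatible_p() is a compile-time constant, the whole expression folds to TYPE_closure_with_timer; only if none of the branches match does the call to the deliberately undefined invalid_closure_type() survive and break the build.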
+
+void closure_sub(struct closure *cl, int v);
+void closure_put(struct closure *cl);
+void closure_queue(struct closure *cl);
+void __closure_wake_up(struct closure_waitlist *list);
+bool closure_wait(struct closure_waitlist *list, struct closure *cl);
+void closure_sync(struct closure *cl);
+
+bool closure_trylock(struct closure *cl, struct closure *parent);
+void __closure_lock(struct closure *cl, struct closure *parent,
+                   struct closure_waitlist *wait_list);
+
+void do_closure_timer_init(struct closure *cl);
+bool __closure_delay(struct closure *cl, unsigned long delay,
+                    struct timer_list *timer);
+void __closure_flush(struct closure *cl, struct timer_list *timer);
+void __closure_flush_sync(struct closure *cl, struct timer_list *timer);
+
+#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
+
+void closure_debug_init(void);
+void closure_debug_create(struct closure *cl);
+void closure_debug_destroy(struct closure *cl);
+
+#else
+
+static inline void closure_debug_init(void) {}
+static inline void closure_debug_create(struct closure *cl) {}
+static inline void closure_debug_destroy(struct closure *cl) {}
+
+#endif
+
+static inline void closure_set_ip(struct closure *cl)
+{
+#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
+       cl->ip = _THIS_IP_;
+#endif
+}
+
+static inline void closure_set_ret_ip(struct closure *cl)
+{
+#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
+       cl->ip = _RET_IP_;
+#endif
+}
+
+static inline void closure_get(struct closure *cl)
+{
+#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
+       BUG_ON((atomic_inc_return(&cl->remaining) &
+               CLOSURE_REMAINING_MASK) <= 1);
+#else
+       atomic_inc(&cl->remaining);
+#endif
+}
+
+static inline void closure_set_stopped(struct closure *cl)
+{
+       atomic_sub(CLOSURE_RUNNING, &cl->remaining);
+}
+
+static inline bool closure_is_stopped(struct closure *cl)
+{
+       return !(atomic_read(&cl->remaining) & CLOSURE_RUNNING);
+}
+
+static inline bool closure_is_unlocked(struct closure *cl)
+{
+       return atomic_read(&cl->remaining) == -1;
+}
+
+static inline void do_closure_init(struct closure *cl, struct closure *parent,
+                                  bool running)
+{
+       switch (cl->type) {
+       case TYPE_closure_with_timer:
+       case TYPE_closure_with_waitlist_and_timer:
+               do_closure_timer_init(cl);
+       default:
+               break;
+       }
+
+       cl->parent = parent;
+       if (parent)
+               closure_get(parent);
+
+       if (running) {
+               closure_debug_create(cl);
+               atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER);
+       } else
+               atomic_set(&cl->remaining, -1);
+
+       closure_set_ip(cl);
+}
+
+/*
+ * Hack to get at the embedded closure if there is one, by doing an unsafe cast:
+ * the result of __closure_type() is thrown away; it's used merely for type
+ * checking.
+ */
+#define __to_internal_closure(cl)                              \
+({                                                             \
+       BUILD_BUG_ON(__closure_type(*cl) > MAX_CLOSURE_TYPE);   \
+       (struct closure *) cl;                                  \
+})
+
+#define closure_init_type(cl, parent, running)                 \
+do {                                                           \
+       struct closure *_cl = __to_internal_closure(cl);        \
+       _cl->type = __closure_type(*(cl));                      \
+       do_closure_init(_cl, parent, running);                  \
+} while (0)
+
+/**
+ * __closure_init() - Initialize a closure, skipping the memset()
+ *
+ * May be used instead of closure_init() when memory has already been zeroed.
+ */
+#define __closure_init(cl, parent)                             \
+       closure_init_type(cl, parent, true)
+
+/**
+ * closure_init() - Initialize a closure, setting the refcount to 1
+ * @cl:                closure to initialize
+ * @parent:    parent of the new closure. cl will take a refcount on it for its
+ *             lifetime; may be NULL.
+ */
+#define closure_init(cl, parent)                               \
+do {                                                           \
+       memset((cl), 0, sizeof(*(cl)));                         \
+       __closure_init(cl, parent);                             \
+} while (0)
+
+static inline void closure_init_stack(struct closure *cl)
+{
+       memset(cl, 0, sizeof(struct closure));
+       atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER|
+                  CLOSURE_BLOCKING|CLOSURE_STACK);
+}
+
+/**
+ * closure_init_unlocked() - Initialize a closure but leave it unlocked.
+ * @cl:                closure to initialize
+ *
+ * For when the closure will be used as a lock. The closure may not be used
+ * until after a closure_lock() or closure_trylock().
+ */
+#define closure_init_unlocked(cl)                              \
+do {                                                           \
+       memset((cl), 0, sizeof(*(cl)));                         \
+       closure_init_type(cl, NULL, false);                     \
+} while (0)
+
+/**
+ * closure_lock() - lock and initialize a closure.
+ * @cl:                the closure to lock
+ * @parent:    the new parent for this closure
+ *
+ * The closure must be of one of the types that has a waitlist (otherwise we
+ * wouldn't be able to sleep on contention).
+ *
+ * @parent has exactly the same meaning as in closure_init(); if non null, the
+ * closure will take a reference on @parent which will be released when it is
+ * unlocked.
+ */
+#define closure_lock(cl, parent)                               \
+       __closure_lock(__to_internal_closure(cl), parent, &(cl)->wait)
+
+/**
+ * closure_delay() - delay some number of jiffies
+ * @cl:                the closure that will sleep
+ * @delay:     the delay in jiffies
+ *
+ * Takes a refcount on @cl which will be released after @delay jiffies; this may
+ * be used to have a function run after a delay with continue_at(), or
+ * closure_sync() may be used for a convoluted version of msleep().
+ */
+#define closure_delay(cl, delay)                       \
+       __closure_delay(__to_internal_closure(cl), delay, &(cl)->timer)
+
+#define closure_flush(cl)                              \
+       __closure_flush(__to_internal_closure(cl), &(cl)->timer)
+
+#define closure_flush_sync(cl)                         \
+       __closure_flush_sync(__to_internal_closure(cl), &(cl)->timer)
+
+static inline void __closure_end_sleep(struct closure *cl)
+{
+       __set_current_state(TASK_RUNNING);
+
+       if (atomic_read(&cl->remaining) & CLOSURE_SLEEPING)
+               atomic_sub(CLOSURE_SLEEPING, &cl->remaining);
+}
+
+static inline void __closure_start_sleep(struct closure *cl)
+{
+       closure_set_ip(cl);
+       cl->task = current;
+       set_current_state(TASK_UNINTERRUPTIBLE);
+
+       if (!(atomic_read(&cl->remaining) & CLOSURE_SLEEPING))
+               atomic_add(CLOSURE_SLEEPING, &cl->remaining);
+}
+
+/**
+ * closure_blocking() - returns true if the closure is in blocking mode.
+ *
+ * If a closure is in blocking mode, closure_wait_event() will sleep until the
+ * condition is true instead of waiting asynchronously.
+ */
+static inline bool closure_blocking(struct closure *cl)
+{
+       return atomic_read(&cl->remaining) & CLOSURE_BLOCKING;
+}
+
+/**
+ * set_closure_blocking() - put a closure in blocking mode.
+ *
+ * If a closure is in blocking mode, closure_wait_event() will sleep until the
+ * condition is true instead of waiting asynchronously.
+ *
+ * Not thread safe - can only be called by the thread running the closure.
+ */
+static inline void set_closure_blocking(struct closure *cl)
+{
+       if (!closure_blocking(cl))
+               atomic_add(CLOSURE_BLOCKING, &cl->remaining);
+}
+
+/*
+ * Not thread safe - can only be called by the thread running the closure.
+ */
+static inline void clear_closure_blocking(struct closure *cl)
+{
+       if (closure_blocking(cl))
+               atomic_sub(CLOSURE_BLOCKING, &cl->remaining);
+}
+
+/**
+ * closure_wake_up() - wake up all closures on a wait list.
+ */
+static inline void closure_wake_up(struct closure_waitlist *list)
+{
+       smp_mb();
+       __closure_wake_up(list);
+}
+
+/*
+ * Wait on an event, synchronously or asynchronously - analogous to wait_event()
+ * but for closures.
+ *
+ * The loop is oddly structured so as to avoid a race; we must check the
+ * condition again after we've added ourselves to the waitlist. We know if we were
+ * already on the waitlist because closure_wait() returns false; thus, we only
+ * schedule or break if closure_wait() returns false. If it returns true, we
+ * just loop again - rechecking the condition.
+ *
+ * The __closure_wake_up() is necessary because we may race with the event
+ * becoming true; i.e. we see event false -> wait -> recheck condition, but the
+ * thread that made the event true may have called closure_wake_up() before we
+ * added ourselves to the wait list.
+ *
+ * We have to call closure_sync() at the end instead of just
+ * __closure_end_sleep() because a different thread might've called
+ * closure_wake_up() before us and gotten preempted before they dropped the
+ * refcount on our closure. If this was a stack allocated closure, that would be
+ * bad.
+ */
+#define __closure_wait_event(list, cl, condition, _block)              \
+({                                                                     \
+       bool block = _block;                                            \
+       typeof(condition) ret;                                          \
+                                                                       \
+       while (1) {                                                     \
+               ret = (condition);                                      \
+               if (ret) {                                              \
+                       __closure_wake_up(list);                        \
+                       if (block)                                      \
+                               closure_sync(cl);                       \
+                                                                       \
+                       break;                                          \
+               }                                                       \
+                                                                       \
+               if (block)                                              \
+                       __closure_start_sleep(cl);                      \
+                                                                       \
+               if (!closure_wait(list, cl)) {                          \
+                       if (!block)                                     \
+                               break;                                  \
+                                                                       \
+                       schedule();                                     \
+               }                                                       \
+       }                                                               \
+                                                                       \
+       ret;                                                            \
+})
+
+/**
+ * closure_wait_event() - wait on a condition, synchronously or asynchronously.
+ * @list:      the wait list to wait on
+ * @cl:                the closure that is doing the waiting
+ * @condition: a C expression for the event to wait for
+ *
+ * If the closure is in blocking mode, sleeps until the @condition evaluates to
+ * true - exactly like wait_event().
+ *
+ * If the closure is not in blocking mode, waits asynchronously: if the
+ * condition is currently false, @cl is put onto @list and the macro returns
+ * immediately. @list then owns a refcount on @cl; closure_sync() or
+ * continue_at() may be used later to wait for another thread to wake up @list,
+ * which drops the refcount on @cl.
+ *
+ * Returns the value of @condition; @cl will be on @list iff @condition was
+ * false.
+ *
+ * closure_wake_up(@list) must be called after changing any variable that could
+ * cause @condition to become true.
+ */
+#define closure_wait_event(list, cl, condition)                                \
+       __closure_wait_event(list, cl, condition, closure_blocking(cl))
+
+#define closure_wait_event_async(list, cl, condition)                  \
+       __closure_wait_event(list, cl, condition, false)
+
+#define closure_wait_event_sync(list, cl, condition)                   \
+       __closure_wait_event(list, cl, condition, true)
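A minimal usage sketch of the blocking form, assuming a hypothetical waitlist wq and counter pending (compare bch_btree_verify() in debug.c later in this series):

        struct closure cl;

        closure_init_stack(&cl);        /* stack closures start in blocking mode */

        /* sleeps until the condition becomes true; the waker must call
         * closure_wake_up(&wq) after updating pending */
        closure_wait_event(&wq, &cl, atomic_read(&pending) == 0);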
+
+static inline void set_closure_fn(struct closure *cl, closure_fn *fn,
+                                 struct workqueue_struct *wq)
+{
+       BUG_ON(object_is_on_stack(cl));
+       closure_set_ip(cl);
+       cl->fn = fn;
+       cl->wq = wq;
+       /* make fn/wq visible before the atomic_dec() in closure_put() */
+       smp_mb__before_atomic_dec();
+}
+
+#define continue_at(_cl, _fn, _wq)                                     \
+do {                                                                   \
+       set_closure_fn(_cl, _fn, _wq);                                  \
+       closure_sub(_cl, CLOSURE_RUNNING + 1);                          \
+       return;                                                         \
+} while (0)
+
+#define closure_return(_cl)    continue_at((_cl), NULL, NULL)
+
+#define continue_at_nobarrier(_cl, _fn, _wq)                           \
+do {                                                                   \
+       set_closure_fn(_cl, _fn, _wq);                                  \
+       closure_queue(_cl);                                             \
+       return;                                                         \
+} while (0)
+
+#define closure_return_with_destructor(_cl, _destructor)               \
+do {                                                                   \
+       set_closure_fn(_cl, _destructor, NULL);                         \
+       closure_sub(_cl, CLOSURE_RUNNING - CLOSURE_DESTRUCTOR + 1);     \
+       return;                                                         \
+} while (0)
+
+static inline void closure_call(struct closure *cl, closure_fn fn,
+                               struct workqueue_struct *wq,
+                               struct closure *parent)
+{
+       closure_init(cl, parent);
+       continue_at_nobarrier(cl, fn, wq);
+}
+
+static inline void closure_trylock_call(struct closure *cl, closure_fn fn,
+                                       struct workqueue_struct *wq,
+                                       struct closure *parent)
+{
+       if (closure_trylock(cl, parent))
+               continue_at_nobarrier(cl, fn, wq);
+}
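Putting the pieces together, a typical two-stage closure state machine looks roughly like this (editorial sketch; my_wq, my_op_start, obj and parent are hypothetical):

        static void my_op_done(struct closure *cl)
        {
                /* every ref taken in my_op_start() has been dropped */
                closure_return(cl);             /* done; releases the parent's ref */
        }

        static void my_op_start(struct closure *cl)
        {
                closure_get(cl);                /* one ref per async sub-operation */
                /* ... submit work whose completion calls closure_put(cl) ... */

                /* drop the running ref; my_op_done() runs on my_wq once the
                 * count hits zero */
                continue_at(cl, my_op_done, my_wq);
        }

        /* kick off: initialize the embedded closure and run the first stage */
        closure_call(&obj->cl, my_op_start, my_wq, parent);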
+
+#endif /* _LINUX_CLOSURE_H */
diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c
new file mode 100644
index 0000000..89fd520
--- /dev/null
@@ -0,0 +1,565 @@
+/*
+ * Assorted bcache debug code
+ *
+ * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
+ * Copyright 2012 Google, Inc.
+ */
+
+#include "bcache.h"
+#include "btree.h"
+#include "debug.h"
+#include "request.h"
+
+#include <linux/console.h>
+#include <linux/debugfs.h>
+#include <linux/module.h>
+#include <linux/random.h>
+#include <linux/seq_file.h>
+
+static struct dentry *debug;
+
+const char *bch_ptr_status(struct cache_set *c, const struct bkey *k)
+{
+       unsigned i;
+
+       for (i = 0; i < KEY_PTRS(k); i++)
+               if (ptr_available(c, k, i)) {
+                       struct cache *ca = PTR_CACHE(c, k, i);
+                       size_t bucket = PTR_BUCKET_NR(c, k, i);
+                       size_t r = bucket_remainder(c, PTR_OFFSET(k, i));
+
+                       if (KEY_SIZE(k) + r > c->sb.bucket_size)
+                               return "bad, length too big";
+                       if (bucket <  ca->sb.first_bucket)
+                               return "bad, short offset";
+                       if (bucket >= ca->sb.nbuckets)
+                               return "bad, offset past end of device";
+                       if (ptr_stale(c, k, i))
+                               return "stale";
+               }
+
+       if (!bkey_cmp(k, &ZERO_KEY))
+               return "bad, null key";
+       if (!KEY_PTRS(k))
+               return "bad, no pointers";
+       if (!KEY_SIZE(k))
+               return "zeroed key";
+       return "";
+}
+
+struct keyprint_hack bch_pkey(const struct bkey *k)
+{
+       unsigned i = 0;
+       struct keyprint_hack r;
+       char *out = r.s, *end = r.s + KEYHACK_SIZE;
+
+#define p(...) (out += scnprintf(out, end - out, __VA_ARGS__))
+
+       p("%llu:%llu len %llu -> [", KEY_INODE(k), KEY_OFFSET(k), KEY_SIZE(k));
+
+       if (KEY_PTRS(k))
+               while (1) {
+                       p("%llu:%llu gen %llu",
+                         PTR_DEV(k, i), PTR_OFFSET(k, i), PTR_GEN(k, i));
+
+                       if (++i == KEY_PTRS(k))
+                               break;
+
+                       p(", ");
+               }
+
+       p("]");
+
+       if (KEY_DIRTY(k))
+               p(" dirty");
+       if (KEY_CSUM(k))
+               p(" cs%llu %llx", KEY_CSUM(k), k->ptr[1]);
+#undef p
+       return r;
+}
+
+struct keyprint_hack bch_pbtree(const struct btree *b)
+{
+       struct keyprint_hack r;
+
+       snprintf(r.s, 40, "%zu level %i/%i", PTR_BUCKET_NR(b->c, &b->key, 0),
+                b->level, b->c->root ? b->c->root->level : -1);
+       return r;
+}
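The point of returning struct keyprint_hack by value is that the formatted string lives in a temporary that survives for the duration of the enclosing expression, so the pkey()/pbtree() wrappers (defined in debug.h later in this series) can be dropped straight into a printk() without any caller-side buffer management:

        printk(KERN_ERR "bad key %s in node %s\n", pkey(k), pbtree(b));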
+
+#if defined(CONFIG_BCACHE_DEBUG) || defined(CONFIG_BCACHE_EDEBUG)
+
+static bool skipped_backwards(struct btree *b, struct bkey *k)
+{
+       return bkey_cmp(k, (!b->level)
+                       ? &START_KEY(bkey_next(k))
+                       : bkey_next(k)) > 0;
+}
+
+static void dump_bset(struct btree *b, struct bset *i)
+{
+       struct bkey *k;
+       unsigned j;
+
+       for (k = i->start; k < end(i); k = bkey_next(k)) {
+               printk(KERN_ERR "block %zu key %zi/%u: %s", index(i, b),
+                      (uint64_t *) k - i->d, i->keys, pkey(k));
+
+               for (j = 0; j < KEY_PTRS(k); j++) {
+                       size_t n = PTR_BUCKET_NR(b->c, k, j);
+                       printk(" bucket %zu", n);
+
+                       if (n >= b->c->sb.first_bucket && n < b->c->sb.nbuckets)
+                               printk(" prio %i",
+                                      PTR_BUCKET(b->c, k, j)->prio);
+               }
+
+               printk(" %s\n", bch_ptr_status(b->c, k));
+
+               if (bkey_next(k) < end(i) &&
+                   skipped_backwards(b, k))
+                       printk(KERN_ERR "Key skipped backwards\n");
+       }
+}
+
+#endif
+
+#ifdef CONFIG_BCACHE_DEBUG
+
+void bch_btree_verify(struct btree *b, struct bset *new)
+{
+       struct btree *v = b->c->verify_data;
+       struct closure cl;
+       closure_init_stack(&cl);
+
+       if (!b->c->verify)
+               return;
+
+       closure_wait_event(&b->io.wait, &cl,
+                          atomic_read(&b->io.cl.remaining) == -1);
+
+       mutex_lock(&b->c->verify_lock);
+
+       bkey_copy(&v->key, &b->key);
+       v->written = 0;
+       v->level = b->level;
+
+       bch_btree_read(v);
+       closure_wait_event(&v->io.wait, &cl,
+                          atomic_read(&b->io.cl.remaining) == -1);
+
+       if (new->keys != v->sets[0].data->keys ||
+           memcmp(new->start,
+                  v->sets[0].data->start,
+                  (void *) end(new) - (void *) new->start)) {
+               unsigned i, j;
+
+               console_lock();
+
+               printk(KERN_ERR "*** original memory node:\n");
+               for (i = 0; i <= b->nsets; i++)
+                       dump_bset(b, b->sets[i].data);
+
+               printk(KERN_ERR "*** sorted memory node:\n");
+               dump_bset(b, new);
+
+               printk(KERN_ERR "*** on disk node:\n");
+               dump_bset(v, v->sets[0].data);
+
+               for (j = 0; j < new->keys; j++)
+                       if (new->d[j] != v->sets[0].data->d[j])
+                               break;
+
+               console_unlock();
+               panic("verify failed at %u\n", j);
+       }
+
+       mutex_unlock(&b->c->verify_lock);
+}
+
+static void data_verify_endio(struct bio *bio, int error)
+{
+       struct closure *cl = bio->bi_private;
+       closure_put(cl);
+}
+
+void bch_data_verify(struct search *s)
+{
+       char name[BDEVNAME_SIZE];
+       struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
+       struct closure *cl = &s->cl;
+       struct bio *check;
+       struct bio_vec *bv;
+       int i;
+
+       if (!s->unaligned_bvec)
+               bio_for_each_segment(bv, s->orig_bio, i)
+                       bv->bv_offset = 0, bv->bv_len = PAGE_SIZE;
+
+       check = bio_clone(s->orig_bio, GFP_NOIO);
+       if (!check)
+               return;
+
+       if (bch_bio_alloc_pages(check, GFP_NOIO))
+               goto out_put;
+
+       check->bi_rw            = READ_SYNC;
+       check->bi_private       = cl;
+       check->bi_end_io        = data_verify_endio;
+
+       closure_bio_submit(check, cl, &dc->disk);
+       closure_sync(cl);
+
+       bio_for_each_segment(bv, s->orig_bio, i) {
+               void *p1 = kmap(bv->bv_page);
+               void *p2 = kmap(check->bi_io_vec[i].bv_page);
+
+               if (memcmp(p1 + bv->bv_offset,
+                          p2 + bv->bv_offset,
+                          bv->bv_len))
+                       printk(KERN_ERR
+                              "bcache (%s): verify failed at sector %llu\n",
+                              bdevname(dc->bdev, name),
+                              (uint64_t) s->orig_bio->bi_sector);
+
+               kunmap(bv->bv_page);
+               kunmap(check->bi_io_vec[i].bv_page);
+       }
+
+       __bio_for_each_segment(bv, check, i, 0)
+               __free_page(bv->bv_page);
+out_put:
+       bio_put(check);
+}
+
+#endif
+
+#ifdef CONFIG_BCACHE_EDEBUG
+
+unsigned bch_count_data(struct btree *b)
+{
+       unsigned ret = 0;
+       struct btree_iter iter;
+       struct bkey *k;
+
+       if (!b->level)
+               for_each_key(b, k, &iter)
+                       ret += KEY_SIZE(k);
+       return ret;
+}
+
+static void vdump_bucket_and_panic(struct btree *b, const char *fmt,
+                                  va_list args)
+{
+       unsigned i;
+
+       console_lock();
+
+       for (i = 0; i <= b->nsets; i++)
+               dump_bset(b, b->sets[i].data);
+
+       vprintk(fmt, args);
+
+       console_unlock();
+
+       panic("at %s\n", pbtree(b));
+}
+
+void bch_check_key_order_msg(struct btree *b, struct bset *i,
+                            const char *fmt, ...)
+{
+       struct bkey *k;
+
+       if (!i->keys)
+               return;
+
+       for (k = i->start; bkey_next(k) < end(i); k = bkey_next(k))
+               if (skipped_backwards(b, k)) {
+                       va_list args;
+                       va_start(args, fmt);
+
+                       vdump_bucket_and_panic(b, fmt, args);
+                       va_end(args);
+               }
+}
+
+void bch_check_keys(struct btree *b, const char *fmt, ...)
+{
+       va_list args;
+       struct bkey *k, *p = NULL;
+       struct btree_iter iter;
+
+       if (b->level)
+               return;
+
+       for_each_key(b, k, &iter) {
+               if (p && bkey_cmp(&START_KEY(p), &START_KEY(k)) > 0) {
+                       printk(KERN_ERR "Keys out of order:\n");
+                       goto bug;
+               }
+
+               if (bch_ptr_invalid(b, k))
+                       continue;
+
+               if (p && bkey_cmp(p, &START_KEY(k)) > 0) {
+                       printk(KERN_ERR "Overlapping keys:\n");
+                       goto bug;
+               }
+               p = k;
+       }
+       return;
+bug:
+       va_start(args, fmt);
+       vdump_bucket_and_panic(b, fmt, args);
+       va_end(args);
+}
+
+#endif
+
+#ifdef CONFIG_DEBUG_FS
+
+/* XXX: cache set refcounting */
+
+struct dump_iterator {
+       char                    buf[PAGE_SIZE];
+       size_t                  bytes;
+       struct cache_set        *c;
+       struct keybuf           keys;
+};
+
+static bool dump_pred(struct keybuf *buf, struct bkey *k)
+{
+       return true;
+}
+
+static ssize_t bch_dump_read(struct file *file, char __user *buf,
+                            size_t size, loff_t *ppos)
+{
+       struct dump_iterator *i = file->private_data;
+       ssize_t ret = 0;
+
+       while (size) {
+               struct keybuf_key *w;
+               unsigned bytes = min(i->bytes, size);
+
+               if (copy_to_user(buf, i->buf, bytes))
+                       return -EFAULT;
+
+               ret      += bytes;
+               buf      += bytes;
+               size     -= bytes;
+               i->bytes -= bytes;
+               memmove(i->buf, i->buf + bytes, i->bytes);
+
+               if (i->bytes)
+                       break;
+
+               w = bch_keybuf_next_rescan(i->c, &i->keys, &MAX_KEY);
+               if (!w)
+                       break;
+
+               i->bytes = snprintf(i->buf, PAGE_SIZE, "%s\n", pkey(&w->key));
+               bch_keybuf_del(&i->keys, w);
+       }
+
+       return ret;
+}
+
+static int bch_dump_open(struct inode *inode, struct file *file)
+{
+       struct cache_set *c = inode->i_private;
+       struct dump_iterator *i;
+
+       i = kzalloc(sizeof(struct dump_iterator), GFP_KERNEL);
+       if (!i)
+               return -ENOMEM;
+
+       file->private_data = i;
+       i->c = c;
+       bch_keybuf_init(&i->keys, dump_pred);
+       i->keys.last_scanned = KEY(0, 0, 0);
+
+       return 0;
+}
+
+static int bch_dump_release(struct inode *inode, struct file *file)
+{
+       kfree(file->private_data);
+       return 0;
+}
+
+static const struct file_operations cache_set_debug_ops = {
+       .owner          = THIS_MODULE,
+       .open           = bch_dump_open,
+       .read           = bch_dump_read,
+       .release        = bch_dump_release
+};
+
+void bch_debug_init_cache_set(struct cache_set *c)
+{
+       if (!IS_ERR_OR_NULL(debug)) {
+               char name[50];
+               snprintf(name, 50, "bcache-%pU", c->sb.set_uuid);
+
+               c->debug = debugfs_create_file(name, 0400, debug, c,
+                                              &cache_set_debug_ops);
+       }
+}
+
+#endif
+
+/* Fuzz tester has rotted: */
+#if 0
+
+static ssize_t btree_fuzz(struct kobject *k, struct kobj_attribute *a,
+                         const char *buffer, size_t size)
+{
+       void dump(struct btree *b)
+       {
+               struct bset *i;
+
+               for (i = b->sets[0].data;
+                    index(i, b) < btree_blocks(b) &&
+                    i->seq == b->sets[0].data->seq;
+                    i = ((void *) i) + set_blocks(i, b->c) * block_bytes(b->c))
+                       dump_bset(b, i);
+       }
+
+       struct cache_sb *sb;
+       struct cache_set *c;
+       struct btree *all[3], *b, *fill, *orig;
+       int j;
+
+       struct btree_op op;
+       bch_btree_op_init_stack(&op);
+
+       sb = kzalloc(sizeof(struct cache_sb), GFP_KERNEL);
+       if (!sb)
+               return -ENOMEM;
+
+       sb->bucket_size = 128;
+       sb->block_size = 4;
+
+       c = bch_cache_set_alloc(sb);
+       if (!c)
+               return -ENOMEM;
+
+       for (j = 0; j < 3; j++) {
+               BUG_ON(list_empty(&c->btree_cache));
+               all[j] = list_first_entry(&c->btree_cache, struct btree, list);
+               list_del_init(&all[j]->list);
+
+               all[j]->key = KEY(0, 0, c->sb.bucket_size);
+               bkey_copy_key(&all[j]->key, &MAX_KEY);
+       }
+
+       b = all[0];
+       fill = all[1];
+       orig = all[2];
+
+       while (1) {
+               for (j = 0; j < 3; j++)
+                       all[j]->written = all[j]->nsets = 0;
+
+               bch_bset_init_next(b);
+
+               while (1) {
+                       struct bset *i = write_block(b);
+                       struct bkey *k = op.keys.top;
+                       unsigned rand;
+
+                       bkey_init(k);
+                       rand = get_random_int();
+
+                       op.type = rand & 1
+                               ? BTREE_INSERT
+                               : BTREE_REPLACE;
+                       rand >>= 1;
+
+                       SET_KEY_SIZE(k, bucket_remainder(c, rand));
+                       rand >>= c->bucket_bits;
+                       rand &= 1024 * 512 - 1;
+                       rand += c->sb.bucket_size;
+                       SET_KEY_OFFSET(k, rand);
+#if 0
+                       SET_KEY_PTRS(k, 1);
+#endif
+                       bch_keylist_push(&op.keys);
+                       bch_btree_insert_keys(b, &op);
+
+                       if (should_split(b) ||
+                           set_blocks(i, b->c) !=
+                           __set_blocks(i, i->keys + 15, b->c)) {
+                               i->csum = csum_set(i);
+
+                               memcpy(write_block(fill),
+                                      i, set_bytes(i));
+
+                               b->written += set_blocks(i, b->c);
+                               fill->written = b->written;
+                               if (b->written == btree_blocks(b))
+                                       break;
+
+                               bch_btree_sort_lazy(b);
+                               bch_bset_init_next(b);
+                       }
+               }
+
+               memcpy(orig->sets[0].data,
+                      fill->sets[0].data,
+                      btree_bytes(c));
+
+               bch_btree_sort(b);
+               fill->written = 0;
+               bch_btree_read_done(&fill->io.cl);
+
+               if (b->sets[0].data->keys != fill->sets[0].data->keys ||
+                   memcmp(b->sets[0].data->start,
+                          fill->sets[0].data->start,
+                          b->sets[0].data->keys * sizeof(uint64_t))) {
+                       struct bset *i = b->sets[0].data;
+                       struct bkey *k, *l;
+
+                       for (k = i->start,
+                            l = fill->sets[0].data->start;
+                            k < end(i);
+                            k = bkey_next(k), l = bkey_next(l))
+                               if (bkey_cmp(k, l) ||
+                                   KEY_SIZE(k) != KEY_SIZE(l))
+                                       pr_err("key %zi differs: %s != %s",
+                                              (uint64_t *) k - i->d,
+                                              pkey(k), pkey(l));
+
+                       for (j = 0; j < 3; j++) {
+                               pr_err("**** Set %i ****", j);
+                               dump(all[j]);
+                       }
+                       panic("\n");
+               }
+
+               pr_info("fuzz complete: %i keys", b->sets[0].data->keys);
+       }
+}
+
+kobj_attribute_write(fuzz, btree_fuzz);
+#endif
+
+void bch_debug_exit(void)
+{
+       if (!IS_ERR_OR_NULL(debug))
+               debugfs_remove_recursive(debug);
+}
+
+int __init bch_debug_init(struct kobject *kobj)
+{
+       int ret = 0;
+#if 0
+       ret = sysfs_create_file(kobj, &ksysfs_fuzz.attr);
+       if (ret)
+               return ret;
+#endif
+
+       debug = debugfs_create_dir("bcache", NULL);
+       return ret;
+}
diff --git a/drivers/md/bcache/debug.h b/drivers/md/bcache/debug.h
new file mode 100644
index 0000000..f9378a2
--- /dev/null
@@ -0,0 +1,54 @@
+#ifndef _BCACHE_DEBUG_H
+#define _BCACHE_DEBUG_H
+
+/* Btree/bkey debug printing */
+
+#define KEYHACK_SIZE 80
+struct keyprint_hack {
+       char s[KEYHACK_SIZE];
+};
+
+struct keyprint_hack bch_pkey(const struct bkey *k);
+struct keyprint_hack bch_pbtree(const struct btree *b);
+#define pkey(k)                (&bch_pkey(k).s[0])
+#define pbtree(b)      (&bch_pbtree(b).s[0])
+
+#ifdef CONFIG_BCACHE_EDEBUG
+
+unsigned bch_count_data(struct btree *);
+void bch_check_key_order_msg(struct btree *, struct bset *, const char *, ...);
+void bch_check_keys(struct btree *, const char *, ...);
+
+#define bch_check_key_order(b, i)                      \
+       bch_check_key_order_msg(b, i, "keys out of order")
+#define EBUG_ON(cond)          BUG_ON(cond)
+
+#else /* EDEBUG */
+
+#define bch_count_data(b)                              0
+#define bch_check_key_order(b, i)                      do {} while (0)
+#define bch_check_key_order_msg(b, i, ...)             do {} while (0)
+#define bch_check_keys(b, ...)                         do {} while (0)
+#define EBUG_ON(cond)                                  do {} while (0)
+
+#endif
+
+#ifdef CONFIG_BCACHE_DEBUG
+
+void bch_btree_verify(struct btree *, struct bset *);
+void bch_data_verify(struct search *);
+
+#else /* DEBUG */
+
+static inline void bch_btree_verify(struct btree *b, struct bset *i) {}
+static inline void bch_data_verify(struct search *s) {}
+
+#endif
+
+#ifdef CONFIG_DEBUG_FS
+void bch_debug_init_cache_set(struct cache_set *);
+#else
+static inline void bch_debug_init_cache_set(struct cache_set *c) {}
+#endif
+
+#endif
diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c
new file mode 100644 (file)
index 0000000..48efd4d
--- /dev/null
@@ -0,0 +1,397 @@
+/*
+ * Some low level IO code, and hacks for various block layer limitations
+ *
+ * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
+ * Copyright 2012 Google, Inc.
+ */
+
+#include "bcache.h"
+#include "bset.h"
+#include "debug.h"
+
+static void bch_bi_idx_hack_endio(struct bio *bio, int error)
+{
+       struct bio *p = bio->bi_private;
+
+       bio_endio(p, error);
+       bio_put(bio);
+}
+
+static void bch_generic_make_request_hack(struct bio *bio)
+{
+       if (bio->bi_idx) {
+               struct bio *clone = bio_alloc(GFP_NOIO, bio_segments(bio));
+
+               memcpy(clone->bi_io_vec,
+                      bio_iovec(bio),
+                      bio_segments(bio) * sizeof(struct bio_vec));
+
+               clone->bi_sector        = bio->bi_sector;
+               clone->bi_bdev          = bio->bi_bdev;
+               clone->bi_rw            = bio->bi_rw;
+               clone->bi_vcnt          = bio_segments(bio);
+               clone->bi_size          = bio->bi_size;
+
+               clone->bi_private       = bio;
+               clone->bi_end_io        = bch_bi_idx_hack_endio;
+
+               bio = clone;
+       }
+
+       /*
+        * Hack, since drivers that clone bios clone up to bi_max_vecs, but our
+        * bios might have had more than that (before we split them per device
+        * limitations).
+        *
+        * To be taken out once immutable bvec stuff is in.
+        */
+       bio->bi_max_vecs = bio->bi_vcnt;
+
+       generic_make_request(bio);
+}
+
+/**
+ * bch_bio_split - split a bio
+ * @bio:       bio to split
+ * @sectors:   number of sectors to split from the front of @bio
+ * @gfp:       gfp mask
+ * @bs:                bio set to allocate from
+ *
+ * Allocates and returns a new bio which represents @sectors from the start of
+ * @bio, and updates @bio to represent the remaining sectors.
+ *
+ * If bio_sectors(@bio) was less than or equal to @sectors, returns @bio
+ * unchanged.
+ *
+ * The newly allocated bio will point to @bio's bi_io_vec, if the split was on a
+ * bvec boundary; it is the caller's responsibility to ensure that @bio is not
+ * freed before the split.
+ *
+ * If bch_bio_split() is running under generic_make_request(), it's not safe to
+ * allocate more than one bio from the same bio set. Therefore, if it is running
+ * under generic_make_request() it masks out __GFP_WAIT when doing the
+ * allocation. The caller must check for failure if there's any possibility of
+ * it being called from under generic_make_request(); it is then the caller's
+ * responsibility to retry from a safe context (by e.g. punting to workqueue).
+ */
+struct bio *bch_bio_split(struct bio *bio, int sectors,
+                         gfp_t gfp, struct bio_set *bs)
+{
+       unsigned idx = bio->bi_idx, vcnt = 0, nbytes = sectors << 9;
+       struct bio_vec *bv;
+       struct bio *ret = NULL;
+
+       BUG_ON(sectors <= 0);
+
+       /*
+        * If we're being called from underneath generic_make_request() and we
+        * already allocated any bios from this bio set, we risk deadlock if we
+        * use the mempool. So instead, we possibly fail and let the caller punt
+        * to workqueue or somesuch and retry in a safe context.
+        */
+       if (current->bio_list)
+               gfp &= ~__GFP_WAIT;
+
+       if (sectors >= bio_sectors(bio))
+               return bio;
+
+       if (bio->bi_rw & REQ_DISCARD) {
+               ret = bio_alloc_bioset(gfp, 1, bs);
+               idx = 0;
+               goto out;
+       }
+
+       bio_for_each_segment(bv, bio, idx) {
+               vcnt = idx - bio->bi_idx;
+
+               if (!nbytes) {
+                       ret = bio_alloc_bioset(gfp, vcnt, bs);
+                       if (!ret)
+                               return NULL;
+
+                       memcpy(ret->bi_io_vec, bio_iovec(bio),
+                              sizeof(struct bio_vec) * vcnt);
+
+                       break;
+               } else if (nbytes < bv->bv_len) {
+                       ret = bio_alloc_bioset(gfp, ++vcnt, bs);
+                       if (!ret)
+                               return NULL;
+
+                       memcpy(ret->bi_io_vec, bio_iovec(bio),
+                              sizeof(struct bio_vec) * vcnt);
+
+                       ret->bi_io_vec[vcnt - 1].bv_len = nbytes;
+                       bv->bv_offset   += nbytes;
+                       bv->bv_len      -= nbytes;
+                       break;
+               }
+
+               nbytes -= bv->bv_len;
+       }
+out:
+       ret->bi_bdev    = bio->bi_bdev;
+       ret->bi_sector  = bio->bi_sector;
+       ret->bi_size    = sectors << 9;
+       ret->bi_rw      = bio->bi_rw;
+       ret->bi_vcnt    = vcnt;
+       ret->bi_max_vecs = vcnt;
+
+       bio->bi_sector  += sectors;
+       bio->bi_size    -= sectors << 9;
+       bio->bi_idx      = idx;
+
+       if (bio_integrity(bio)) {
+               if (bio_integrity_clone(ret, bio, gfp)) {
+                       bio_put(ret);
+                       return NULL;
+               }
+
+               bio_integrity_trim(ret, 0, bio_sectors(ret));
+               bio_integrity_trim(bio, bio_sectors(ret), bio_sectors(bio));
+       }
+
+       return ret;
+}
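As the comment above spells out, a caller running under generic_make_request() must be prepared for a NULL return and retry from a safe context. A compact editorial sketch of the submit loop (my_bio_set, submit_one() and punt_to_workqueue() are hypothetical; __bch_bio_submit_split() below does the same thing, using a closure and continue_at() for the retry):

        struct bio *n;

        do {
                n = bch_bio_split(bio, bch_bio_max_sectors(bio),
                                  GFP_NOIO, my_bio_set);
                if (!n) {
                        /* allocation failed under generic_make_request():
                         * punt to a workqueue and retry from there */
                        punt_to_workqueue(bio);
                        return;
                }
                submit_one(n);
        } while (n != bio);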
+
+static unsigned bch_bio_max_sectors(struct bio *bio)
+{
+       unsigned ret = bio_sectors(bio);
+       struct request_queue *q = bdev_get_queue(bio->bi_bdev);
+       unsigned max_segments = min_t(unsigned, BIO_MAX_PAGES,
+                                     queue_max_segments(q));
+       struct bio_vec *bv, *end = bio_iovec(bio) +
+               min_t(int, bio_segments(bio), max_segments);
+
+       if (bio->bi_rw & REQ_DISCARD)
+               return min(ret, q->limits.max_discard_sectors);
+
+       if (bio_segments(bio) > max_segments ||
+           q->merge_bvec_fn) {
+               ret = 0;
+
+               for (bv = bio_iovec(bio); bv < end; bv++) {
+                       struct bvec_merge_data bvm = {
+                               .bi_bdev        = bio->bi_bdev,
+                               .bi_sector      = bio->bi_sector,
+                               .bi_size        = ret << 9,
+                               .bi_rw          = bio->bi_rw,
+                       };
+
+                       if (q->merge_bvec_fn &&
+                           q->merge_bvec_fn(q, &bvm, bv) < (int) bv->bv_len)
+                               break;
+
+                       ret += bv->bv_len >> 9;
+               }
+       }
+
+       ret = min(ret, queue_max_sectors(q));
+
+       WARN_ON(!ret);
+       ret = max_t(int, ret, bio_iovec(bio)->bv_len >> 9);
+
+       return ret;
+}
+
+static void bch_bio_submit_split_done(struct closure *cl)
+{
+       struct bio_split_hook *s = container_of(cl, struct bio_split_hook, cl);
+
+       s->bio->bi_end_io = s->bi_end_io;
+       s->bio->bi_private = s->bi_private;
+       bio_endio(s->bio, 0);
+
+       closure_debug_destroy(&s->cl);
+       mempool_free(s, s->p->bio_split_hook);
+}
+
+static void bch_bio_submit_split_endio(struct bio *bio, int error)
+{
+       struct closure *cl = bio->bi_private;
+       struct bio_split_hook *s = container_of(cl, struct bio_split_hook, cl);
+
+       if (error)
+               clear_bit(BIO_UPTODATE, &s->bio->bi_flags);
+
+       bio_put(bio);
+       closure_put(cl);
+}
+
+static void __bch_bio_submit_split(struct closure *cl)
+{
+       struct bio_split_hook *s = container_of(cl, struct bio_split_hook, cl);
+       struct bio *bio = s->bio, *n;
+
+       do {
+               n = bch_bio_split(bio, bch_bio_max_sectors(bio),
+                                 GFP_NOIO, s->p->bio_split);
+               if (!n)
+                       continue_at(cl, __bch_bio_submit_split, system_wq);
+
+               n->bi_end_io    = bch_bio_submit_split_endio;
+               n->bi_private   = cl;
+
+               closure_get(cl);
+               bch_generic_make_request_hack(n);
+       } while (n != bio);
+
+       continue_at(cl, bch_bio_submit_split_done, NULL);
+}
+
+void bch_generic_make_request(struct bio *bio, struct bio_split_pool *p)
+{
+       struct bio_split_hook *s;
+
+       if (!bio_has_data(bio) && !(bio->bi_rw & REQ_DISCARD))
+               goto submit;
+
+       if (bio_sectors(bio) <= bch_bio_max_sectors(bio))
+               goto submit;
+
+       s = mempool_alloc(p->bio_split_hook, GFP_NOIO);
+
+       s->bio          = bio;
+       s->p            = p;
+       s->bi_end_io    = bio->bi_end_io;
+       s->bi_private   = bio->bi_private;
+       bio_get(bio);
+
+       closure_call(&s->cl, __bch_bio_submit_split, NULL, NULL);
+       return;
+submit:
+       bch_generic_make_request_hack(bio);
+}
+
+/* Bios with headers */
+
+void bch_bbio_free(struct bio *bio, struct cache_set *c)
+{
+       struct bbio *b = container_of(bio, struct bbio, bio);
+       mempool_free(b, c->bio_meta);
+}
+
+struct bio *bch_bbio_alloc(struct cache_set *c)
+{
+       struct bbio *b = mempool_alloc(c->bio_meta, GFP_NOIO);
+       struct bio *bio = &b->bio;
+
+       bio_init(bio);
+       bio->bi_flags           |= BIO_POOL_NONE << BIO_POOL_OFFSET;
+       bio->bi_max_vecs         = bucket_pages(c);
+       bio->bi_io_vec           = bio->bi_inline_vecs;
+
+       return bio;
+}
+
+void __bch_submit_bbio(struct bio *bio, struct cache_set *c)
+{
+       struct bbio *b = container_of(bio, struct bbio, bio);
+
+       bio->bi_sector  = PTR_OFFSET(&b->key, 0);
+       bio->bi_bdev    = PTR_CACHE(c, &b->key, 0)->bdev;
+
+       b->submit_time_us = local_clock_us();
+       closure_bio_submit(bio, bio->bi_private, PTR_CACHE(c, &b->key, 0));
+}
+
+void bch_submit_bbio(struct bio *bio, struct cache_set *c,
+                    struct bkey *k, unsigned ptr)
+{
+       struct bbio *b = container_of(bio, struct bbio, bio);
+       bch_bkey_copy_single_ptr(&b->key, k, ptr);
+       __bch_submit_bbio(bio, c);
+}
+
+/* IO errors */
+
+void bch_count_io_errors(struct cache *ca, int error, const char *m)
+{
+       /*
+        * The halflife of an error is:
+        * log2(1/2)/log2(127/128) * refresh ~= 88 * refresh
+        */
+
+       if (ca->set->error_decay) {
+               unsigned count = atomic_inc_return(&ca->io_count);
+
+               while (count > ca->set->error_decay) {
+                       unsigned errors;
+                       unsigned old = count;
+                       unsigned new = count - ca->set->error_decay;
+
+                       /*
+                        * First we subtract refresh from count; each time we
+                        * successfully do so, we rescale the errors once:
+                        */
+
+                       count = atomic_cmpxchg(&ca->io_count, old, new);
+
+                       if (count == old) {
+                               count = new;
+
+                               errors = atomic_read(&ca->io_errors);
+                               do {
+                                       old = errors;
+                                       new = ((uint64_t) errors * 127) / 128;
+                                       errors = atomic_cmpxchg(&ca->io_errors,
+                                                               old, new);
+                               } while (old != errors);
+                       }
+               }
+       }
+
+       if (error) {
+               char buf[BDEVNAME_SIZE];
+               unsigned errors = atomic_add_return(1 << IO_ERROR_SHIFT,
+                                                   &ca->io_errors);
+               errors >>= IO_ERROR_SHIFT;
+
+               if (errors < ca->set->error_limit)
+                       pr_err("%s: IO error on %s, recovering",
+                              bdevname(ca->bdev, buf), m);
+               else
+                       bch_cache_set_error(ca->set,
+                                           "%s: too many IO errors %s",
+                                           bdevname(ca->bdev, buf), m);
+       }
+}
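To make the halflife figure in the comment above concrete: every error_decay completed IOs the error count is scaled by 127/128, so halving it takes log(1/2) / log(127/128) rescalings. A one-line editorial check in plain userspace C:

        #include <math.h>

        double halvings = log(0.5) / log(127.0 / 128.0);        /* ~= 88.4 */

which is where the "~= 88 * refresh" in the comment comes from.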
+
+void bch_bbio_count_io_errors(struct cache_set *c, struct bio *bio,
+                             int error, const char *m)
+{
+       struct bbio *b = container_of(bio, struct bbio, bio);
+       struct cache *ca = PTR_CACHE(c, &b->key, 0);
+
+       unsigned threshold = bio->bi_rw & REQ_WRITE
+               ? c->congested_write_threshold_us
+               : c->congested_read_threshold_us;
+
+       if (threshold) {
+               unsigned t = local_clock_us();
+
+               int us = t - b->submit_time_us;
+               int congested = atomic_read(&c->congested);
+
+               if (us > (int) threshold) {
+                       int ms = us / 1024;
+                       c->congested_last_us = t;
+
+                       ms = min(ms, CONGESTED_MAX + congested);
+                       atomic_sub(ms, &c->congested);
+               } else if (congested < 0)
+                       atomic_inc(&c->congested);
+       }
+
+       bch_count_io_errors(ca, error, m);
+}
+
+void bch_bbio_endio(struct cache_set *c, struct bio *bio,
+                   int error, const char *m)
+{
+       struct closure *cl = bio->bi_private;
+
+       bch_bbio_count_io_errors(c, bio, error, m);
+       bio_put(bio);
+       closure_put(cl);
+}
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
new file mode 100644 (file)
index 0000000..8c8dfdc
--- /dev/null
@@ -0,0 +1,787 @@
+/*
+ * bcache journalling code, for btree insertions
+ *
+ * Copyright 2012 Google, Inc.
+ */
+
+#include "bcache.h"
+#include "btree.h"
+#include "debug.h"
+#include "request.h"
+
+/*
+ * Journal replay/recovery:
+ *
+ * This code is all driven from run_cache_set(); we first read the journal
+ * entries, do some other stuff, then we mark all the keys in the journal
+ * entries (same as garbage collection would), then we replay them - reinserting
+ * them into the cache in precisely the same order as they appear in the
+ * journal.
+ *
+ * We only journal keys that go in leaf nodes, which simplifies things quite a
+ * bit.
+ */
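A rough editorial sketch of the sequence described above, using the entry points defined in this file (the real driver is run_cache_set() in super.c, which is outside this hunk):

        LIST_HEAD(journal);
        struct btree_op op;

        bch_btree_op_init_stack(&op);

        bch_journal_read(c, &journal, &op);     /* collect entries from each cache    */
        /* ... other cache set bring-up ... */
        bch_journal_mark(c, &journal);          /* mark journalled keys, as GC would  */
        bch_journal_replay(c, &journal, &op);   /* reinsert keys in journal order     */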
+
+static void journal_read_endio(struct bio *bio, int error)
+{
+       struct closure *cl = bio->bi_private;
+       closure_put(cl);
+}
+
+static int journal_read_bucket(struct cache *ca, struct list_head *list,
+                              struct btree_op *op, unsigned bucket_index)
+{
+       struct journal_device *ja = &ca->journal;
+       struct bio *bio = &ja->bio;
+
+       struct journal_replay *i;
+       struct jset *j, *data = ca->set->journal.w[0].data;
+       unsigned len, left, offset = 0;
+       int ret = 0;
+       sector_t bucket = bucket_to_sector(ca->set, ca->sb.d[bucket_index]);
+
+       pr_debug("reading %llu", (uint64_t) bucket);
+
+       while (offset < ca->sb.bucket_size) {
+reread:                left = ca->sb.bucket_size - offset;
+               len = min_t(unsigned, left, PAGE_SECTORS * 8);
+
+               bio_reset(bio);
+               bio->bi_sector  = bucket + offset;
+               bio->bi_bdev    = ca->bdev;
+               bio->bi_rw      = READ;
+               bio->bi_size    = len << 9;
+
+               bio->bi_end_io  = journal_read_endio;
+               bio->bi_private = &op->cl;
+               bch_bio_map(bio, data);
+
+               closure_bio_submit(bio, &op->cl, ca);
+               closure_sync(&op->cl);
+
+               /* This function could be simpler now since we no longer write
+                * journal entries that overlap bucket boundaries; this means
+                * the start of a bucket will always have a valid journal entry
+                * if it has any journal entries at all.
+                */
+
+               j = data;
+               while (len) {
+                       struct list_head *where;
+                       size_t blocks, bytes = set_bytes(j);
+
+                       if (j->magic != jset_magic(ca->set))
+                               return ret;
+
+                       if (bytes > left << 9)
+                               return ret;
+
+                       if (bytes > len << 9)
+                               goto reread;
+
+                       if (j->csum != csum_set(j))
+                               return ret;
+
+                       blocks = set_blocks(j, ca->set);
+
+                       while (!list_empty(list)) {
+                               i = list_first_entry(list,
+                                       struct journal_replay, list);
+                               if (i->j.seq >= j->last_seq)
+                                       break;
+                               list_del(&i->list);
+                               kfree(i);
+                       }
+
+                       list_for_each_entry_reverse(i, list, list) {
+                               if (j->seq == i->j.seq)
+                                       goto next_set;
+
+                               if (j->seq < i->j.last_seq)
+                                       goto next_set;
+
+                               if (j->seq > i->j.seq) {
+                                       where = &i->list;
+                                       goto add;
+                               }
+                       }
+
+                       where = list;
+add:
+                       i = kmalloc(offsetof(struct journal_replay, j) +
+                                   bytes, GFP_KERNEL);
+                       if (!i)
+                               return -ENOMEM;
+                       memcpy(&i->j, j, bytes);
+                       list_add(&i->list, where);
+                       ret = 1;
+
+                       ja->seq[bucket_index] = j->seq;
+next_set:
+                       offset  += blocks * ca->sb.block_size;
+                       len     -= blocks * ca->sb.block_size;
+                       j = ((void *) j) + blocks * block_bytes(ca);
+               }
+       }
+
+       return ret;
+}
+
+int bch_journal_read(struct cache_set *c, struct list_head *list,
+                       struct btree_op *op)
+{
+#define read_bucket(b)                                                 \
+       ({                                                              \
+               int ret = journal_read_bucket(ca, list, op, b);         \
+               __set_bit(b, bitmap);                                   \
+               if (ret < 0)                                            \
+                       return ret;                                     \
+               ret;                                                    \
+       })
+
+       struct cache *ca;
+       unsigned iter;
+
+       for_each_cache(ca, c, iter) {
+               struct journal_device *ja = &ca->journal;
+               unsigned long bitmap[SB_JOURNAL_BUCKETS / BITS_PER_LONG];
+               unsigned i, l, r, m;
+               uint64_t seq;
+
+               bitmap_zero(bitmap, SB_JOURNAL_BUCKETS);
+               pr_debug("%u journal buckets", ca->sb.njournal_buckets);
+
+               /* Read journal buckets ordered by golden ratio hash to quickly
+                * find a sequence of buckets with valid journal entries
+                */
+               for (i = 0; i < ca->sb.njournal_buckets; i++) {
+                       l = (i * 2654435769U) % ca->sb.njournal_buckets;
+
+                       if (test_bit(l, bitmap))
+                               break;
+
+                       if (read_bucket(l))
+                               goto bsearch;
+               }
+
+               /* If that fails, check all the buckets we haven't checked
+                * already
+                */
+               pr_debug("falling back to linear search");
+
+               for (l = 0; l < ca->sb.njournal_buckets; l++) {
+                       if (test_bit(l, bitmap))
+                               continue;
+
+                       if (read_bucket(l))
+                               goto bsearch;
+               }
+bsearch:
+               /* Binary search */
+               m = r = find_next_bit(bitmap, ca->sb.njournal_buckets, l + 1);
+               pr_debug("starting binary search, l %u r %u", l, r);
+
+               while (l + 1 < r) {
+                       m = (l + r) >> 1;
+
+                       if (read_bucket(m))
+                               l = m;
+                       else
+                               r = m;
+               }
+
+               /* Read buckets in reverse order until we stop finding more
+                * journal entries
+                */
+               pr_debug("finishing up");
+               l = m;
+
+               while (1) {
+                       if (!l--)
+                               l = ca->sb.njournal_buckets - 1;
+
+                       if (l == m)
+                               break;
+
+                       if (test_bit(l, bitmap))
+                               continue;
+
+                       if (!read_bucket(l))
+                               break;
+               }
+
+               seq = 0;
+
+               for (i = 0; i < ca->sb.njournal_buckets; i++)
+                       if (ja->seq[i] > seq) {
+                               seq = ja->seq[i];
+                               ja->cur_idx = ja->discard_idx =
+                                       ja->last_idx = i;
+                       }
+       }
+
+       if (!list_empty(list))
+               c->journal.seq = list_entry(list->prev,
+                                           struct journal_replay,
+                                           list)->j.seq;
+
+       return 0;
+#undef read_bucket
+}
+
+void bch_journal_mark(struct cache_set *c, struct list_head *list)
+{
+       atomic_t p = { 0 };
+       struct bkey *k;
+       struct journal_replay *i;
+       struct journal *j = &c->journal;
+       uint64_t last = j->seq;
+
+       /*
+        * journal.pin should never fill up - we never write a journal
+        * entry when it would fill up. But if for some reason it does, we
+        * iterate over the list in reverse order so that we can just skip that
+        * refcount instead of bugging.
+        */
+
+       list_for_each_entry_reverse(i, list, list) {
+               BUG_ON(last < i->j.seq);
+               i->pin = NULL;
+
+               while (last-- != i->j.seq)
+                       if (fifo_free(&j->pin) > 1) {
+                               fifo_push_front(&j->pin, p);
+                               atomic_set(&fifo_front(&j->pin), 0);
+                       }
+
+               if (fifo_free(&j->pin) > 1) {
+                       fifo_push_front(&j->pin, p);
+                       i->pin = &fifo_front(&j->pin);
+                       atomic_set(i->pin, 1);
+               }
+
+               for (k = i->j.start;
+                    k < end(&i->j);
+                    k = bkey_next(k)) {
+                       unsigned j;
+
+                       for (j = 0; j < KEY_PTRS(k); j++) {
+                               struct bucket *g = PTR_BUCKET(c, k, j);
+                               atomic_inc(&g->pin);
+
+                               if (g->prio == BTREE_PRIO &&
+                                   !ptr_stale(c, k, j))
+                                       g->prio = INITIAL_PRIO;
+                       }
+
+                       __bch_btree_mark_key(c, 0, k);
+               }
+       }
+}
+
+int bch_journal_replay(struct cache_set *s, struct list_head *list,
+                         struct btree_op *op)
+{
+       int ret = 0, keys = 0, entries = 0;
+       struct bkey *k;
+       struct journal_replay *i =
+               list_entry(list->prev, struct journal_replay, list);
+
+       uint64_t start = i->j.last_seq, end = i->j.seq, n = start;
+
+       list_for_each_entry(i, list, list) {
+               BUG_ON(i->pin && atomic_read(i->pin) != 1);
+
+               if (n != i->j.seq)
+                       pr_err(
+               "journal entries %llu-%llu missing! (replaying %llu-%llu)\n",
+               n, i->j.seq - 1, start, end);
+
+               for (k = i->j.start;
+                    k < end(&i->j);
+                    k = bkey_next(k)) {
+                       pr_debug("%s", pkey(k));
+                       bkey_copy(op->keys.top, k);
+                       bch_keylist_push(&op->keys);
+
+                       op->journal = i->pin;
+                       atomic_inc(op->journal);
+
+                       ret = bch_btree_insert(op, s);
+                       if (ret)
+                               goto err;
+
+                       BUG_ON(!bch_keylist_empty(&op->keys));
+                       keys++;
+
+                       cond_resched();
+               }
+
+               if (i->pin)
+                       atomic_dec(i->pin);
+               n = i->j.seq + 1;
+               entries++;
+       }
+
+       pr_info("journal replay done, %i keys in %i entries, seq %llu",
+               keys, entries, end);
+
+       while (!list_empty(list)) {
+               i = list_first_entry(list, struct journal_replay, list);
+               list_del(&i->list);
+               kfree(i);
+       }
+err:
+       closure_sync(&op->cl);
+       return ret;
+}
+
+/* Journalling */
+
+static void btree_flush_write(struct cache_set *c)
+{
+       /*
+        * Try to find the btree node that references the oldest journal
+        * entry; best is our current candidate and is locked if non-NULL:
+        */
+       struct btree *b, *best = NULL;
+       unsigned iter;
+
+       for_each_cached_btree(b, c, iter) {
+               if (!down_write_trylock(&b->lock))
+                       continue;
+
+               if (!btree_node_dirty(b) ||
+                   !btree_current_write(b)->journal) {
+                       rw_unlock(true, b);
+                       continue;
+               }
+
+               if (!best)
+                       best = b;
+               else if (journal_pin_cmp(c,
+                                        btree_current_write(best),
+                                        btree_current_write(b))) {
+                       rw_unlock(true, best);
+                       best = b;
+               } else
+                       rw_unlock(true, b);
+       }
+
+       if (best)
+               goto out;
+
+       /* We can't find the best btree node, just pick the first */
+       list_for_each_entry(b, &c->btree_cache, list)
+               if (!b->level && btree_node_dirty(b)) {
+                       best = b;
+                       rw_lock(true, best, best->level);
+                       goto found;
+               }
+
+out:
+       if (!best)
+               return;
+found:
+       if (btree_node_dirty(best))
+               bch_btree_write(best, true, NULL);
+       rw_unlock(true, best);
+}
+
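+/*
+ * Sequence number of the oldest journal entry that's still open: the pin fifo
+ * holds one refcount per open entry, so (illustrative numbers) with seq == 10
+ * and 3 entries in the fifo, entries 8..10 are open and last_seq() == 8.
+ */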
+#define last_seq(j)    ((j)->seq - fifo_used(&(j)->pin) + 1)
+
+static void journal_discard_endio(struct bio *bio, int error)
+{
+       struct journal_device *ja =
+               container_of(bio, struct journal_device, discard_bio);
+       struct cache *ca = container_of(ja, struct cache, journal);
+
+       atomic_set(&ja->discard_in_flight, DISCARD_DONE);
+
+       closure_wake_up(&ca->set->journal.wait);
+       closure_put(&ca->set->cl);
+}
+
+static void journal_discard_work(struct work_struct *work)
+{
+       struct journal_device *ja =
+               container_of(work, struct journal_device, discard_work);
+
+       submit_bio(0, &ja->discard_bio);
+}
+
+static void do_journal_discard(struct cache *ca)
+{
+       struct journal_device *ja = &ca->journal;
+       struct bio *bio = &ja->discard_bio;
+
+       if (!ca->discard) {
+               ja->discard_idx = ja->last_idx;
+               return;
+       }
+
+       switch (atomic_read(&ja->discard_in_flight)) {
+       case DISCARD_IN_FLIGHT:
+               return;
+
+       case DISCARD_DONE:
+               ja->discard_idx = (ja->discard_idx + 1) %
+                       ca->sb.njournal_buckets;
+
+               atomic_set(&ja->discard_in_flight, DISCARD_READY);
+               /* fallthrough */
+
+       case DISCARD_READY:
+               if (ja->discard_idx == ja->last_idx)
+                       return;
+
+               atomic_set(&ja->discard_in_flight, DISCARD_IN_FLIGHT);
+
+               bio_init(bio);
+               bio->bi_sector          = bucket_to_sector(ca->set,
+                                               ca->sb.d[ja->discard_idx]);
+               bio->bi_bdev            = ca->bdev;
+               bio->bi_rw              = REQ_WRITE|REQ_DISCARD;
+               bio->bi_max_vecs        = 1;
+               bio->bi_io_vec          = bio->bi_inline_vecs;
+               bio->bi_size            = bucket_bytes(ca);
+               bio->bi_end_io          = journal_discard_endio;
+
+               closure_get(&ca->set->cl);
+               INIT_WORK(&ja->discard_work, journal_discard_work);
+               schedule_work(&ja->discard_work);
+       }
+}
+
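+/*
+ * Free up journal space: drop refcounts that have gone to zero, advance each
+ * device's last_idx past buckets whose newest entry is older than last_seq,
+ * issue discards for those buckets, and if the current bucket is used up pick
+ * the next bucket(s) to write journal entries to.
+ */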
+static void journal_reclaim(struct cache_set *c)
+{
+       struct bkey *k = &c->journal.key;
+       struct cache *ca;
+       uint64_t last_seq;
+       unsigned iter, n = 0;
+       atomic_t p;
+
+       while (!atomic_read(&fifo_front(&c->journal.pin)))
+               fifo_pop(&c->journal.pin, p);
+
+       last_seq = last_seq(&c->journal);
+
+       /* Update last_idx */
+
+       for_each_cache(ca, c, iter) {
+               struct journal_device *ja = &ca->journal;
+
+               while (ja->last_idx != ja->cur_idx &&
+                      ja->seq[ja->last_idx] < last_seq)
+                       ja->last_idx = (ja->last_idx + 1) %
+                               ca->sb.njournal_buckets;
+       }
+
+       for_each_cache(ca, c, iter)
+               do_journal_discard(ca);
+
+       if (c->journal.blocks_free)
+               return;
+
+       /*
+        * Allocate:
+        * XXX: Sort by free journal space
+        */
+
+       for_each_cache(ca, c, iter) {
+               struct journal_device *ja = &ca->journal;
+               unsigned next = (ja->cur_idx + 1) % ca->sb.njournal_buckets;
+
+               /* No space available on this device */
+               if (next == ja->discard_idx)
+                       continue;
+
+               ja->cur_idx = next;
+               k->ptr[n++] = PTR(0,
+                                 bucket_to_sector(c, ca->sb.d[ja->cur_idx]),
+                                 ca->sb.nr_this_dev);
+       }
+
+       bkey_init(k);
+       SET_KEY_PTRS(k, n);
+
+       if (n)
+               c->journal.blocks_free = c->sb.bucket_size >> c->block_bits;
+
+       if (!journal_full(&c->journal))
+               __closure_wake_up(&c->journal.wait);
+}
+
+void bch_journal_next(struct journal *j)
+{
+       atomic_t p = { 1 };
+
+       j->cur = (j->cur == j->w)
+               ? &j->w[1]
+               : &j->w[0];
+
+       /*
+        * The fifo_push() needs to happen at the same time as j->seq is
+        * incremented for last_seq() to be calculated correctly
+        */
+       BUG_ON(!fifo_push(&j->pin, p));
+       atomic_set(&fifo_back(&j->pin), 1);
+
+       j->cur->data->seq       = ++j->seq;
+       j->cur->need_write      = false;
+       j->cur->data->keys      = 0;
+
+       if (fifo_full(&j->pin))
+               pr_debug("journal_pin full (%zu)", fifo_used(&j->pin));
+}
+
+static void journal_write_endio(struct bio *bio, int error)
+{
+       struct journal_write *w = bio->bi_private;
+
+       cache_set_err_on(error, w->c, "journal io error");
+       closure_put(&w->c->journal.io.cl);
+}
+
+static void journal_write(struct closure *);
+
+static void journal_write_done(struct closure *cl)
+{
+       struct journal *j = container_of(cl, struct journal, io.cl);
+       struct cache_set *c = container_of(j, struct cache_set, journal);
+
+       struct journal_write *w = (j->cur == j->w)
+               ? &j->w[1]
+               : &j->w[0];
+
+       __closure_wake_up(&w->wait);
+
+       if (c->journal_delay_ms)
+               closure_delay(&j->io, msecs_to_jiffies(c->journal_delay_ms));
+
+       continue_at(cl, journal_write, system_wq);
+}
+
+static void journal_write_unlocked(struct closure *cl)
+       __releases(c->journal.lock)
+{
+       struct cache_set *c = container_of(cl, struct cache_set, journal.io.cl);
+       struct cache *ca;
+       struct journal_write *w = c->journal.cur;
+       struct bkey *k = &c->journal.key;
+       unsigned i, sectors = set_blocks(w->data, c) * c->sb.block_size;
+
+       struct bio *bio;
+       struct bio_list list;
+       bio_list_init(&list);
+
+       if (!w->need_write) {
+               /*
+                * XXX: have to unlock closure before we unlock journal lock,
+                * else we race with bch_journal(). But this way we race
+                * against cache set unregister. Doh.
+                */
+               set_closure_fn(cl, NULL, NULL);
+               closure_sub(cl, CLOSURE_RUNNING + 1);
+               spin_unlock(&c->journal.lock);
+               return;
+       } else if (journal_full(&c->journal)) {
+               journal_reclaim(c);
+               spin_unlock(&c->journal.lock);
+
+               btree_flush_write(c);
+               continue_at(cl, journal_write, system_wq);
+       }
+
+       c->journal.blocks_free -= set_blocks(w->data, c);
+
+       w->data->btree_level = c->root->level;
+
+       bkey_copy(&w->data->btree_root, &c->root->key);
+       bkey_copy(&w->data->uuid_bucket, &c->uuid_bucket);
+
+       for_each_cache(ca, c, i)
+               w->data->prio_bucket[ca->sb.nr_this_dev] = ca->prio_buckets[0];
+
+       w->data->magic          = jset_magic(c);
+       w->data->version        = BCACHE_JSET_VERSION;
+       w->data->last_seq       = last_seq(&c->journal);
+       w->data->csum           = csum_set(w->data);
+
+       for (i = 0; i < KEY_PTRS(k); i++) {
+               ca = PTR_CACHE(c, k, i);
+               bio = &ca->journal.bio;
+
+               atomic_long_add(sectors, &ca->meta_sectors_written);
+
+               bio_reset(bio);
+               bio->bi_sector  = PTR_OFFSET(k, i);
+               bio->bi_bdev    = ca->bdev;
+               bio->bi_rw      = REQ_WRITE|REQ_SYNC|REQ_META|REQ_FLUSH;
+               bio->bi_size    = sectors << 9;
+
+               bio->bi_end_io  = journal_write_endio;
+               bio->bi_private = w;
+               bch_bio_map(bio, w->data);
+
+               trace_bcache_journal_write(bio);
+               bio_list_add(&list, bio);
+
+               SET_PTR_OFFSET(k, i, PTR_OFFSET(k, i) + sectors);
+
+               ca->journal.seq[ca->journal.cur_idx] = w->data->seq;
+       }
+
+       atomic_dec_bug(&fifo_back(&c->journal.pin));
+       bch_journal_next(&c->journal);
+       journal_reclaim(c);
+
+       spin_unlock(&c->journal.lock);
+
+       while ((bio = bio_list_pop(&list)))
+               closure_bio_submit(bio, cl, c->cache[0]);
+
+       continue_at(cl, journal_write_done, NULL);
+}
+
+static void journal_write(struct closure *cl)
+{
+       struct cache_set *c = container_of(cl, struct cache_set, journal.io.cl);
+
+       spin_lock(&c->journal.lock);
+       journal_write_unlocked(cl);
+}
+
+static void __journal_try_write(struct cache_set *c, bool noflush)
+       __releases(c->journal.lock)
+{
+       struct closure *cl = &c->journal.io.cl;
+
+       if (!closure_trylock(cl, &c->cl))
+               spin_unlock(&c->journal.lock);
+       else if (noflush && journal_full(&c->journal)) {
+               spin_unlock(&c->journal.lock);
+               continue_at(cl, journal_write, system_wq);
+       } else
+               journal_write_unlocked(cl);
+}
+
+#define journal_try_write(c)   __journal_try_write(c, false)
+
+void bch_journal_meta(struct cache_set *c, struct closure *cl)
+{
+       struct journal_write *w;
+
+       if (CACHE_SYNC(&c->sb)) {
+               spin_lock(&c->journal.lock);
+
+               w = c->journal.cur;
+               w->need_write = true;
+
+               if (cl)
+                       BUG_ON(!closure_wait(&w->wait, cl));
+
+               __journal_try_write(c, true);
+       }
+}
+
+/*
+ * Entry point to the journalling code - bch_insert_data_loop() and
+ * bio_invalidate() pass bch_journal() a list of keys to be journalled, and
+ * then bch_journal() hands those same keys off to bch_btree_insert_async()
+ */
+
+void bch_journal(struct closure *cl)
+{
+       struct btree_op *op = container_of(cl, struct btree_op, cl);
+       struct cache_set *c = op->c;
+       struct journal_write *w;
+       size_t b, n = ((uint64_t *) op->keys.top) - op->keys.list;
+
+       if (op->type != BTREE_INSERT ||
+           !CACHE_SYNC(&c->sb))
+               goto out;
+
+       /*
+        * If we're looping because we errored, we might already be waiting on
+        * another journal write:
+        */
+       while (atomic_read(&cl->parent->remaining) & CLOSURE_WAITING)
+               closure_sync(cl->parent);
+
+       spin_lock(&c->journal.lock);
+
+       if (journal_full(&c->journal)) {
+               /* XXX: tracepoint */
+               closure_wait(&c->journal.wait, cl);
+
+               journal_reclaim(c);
+               spin_unlock(&c->journal.lock);
+
+               btree_flush_write(c);
+               continue_at(cl, bch_journal, bcache_wq);
+       }
+
+       w = c->journal.cur;
+       w->need_write = true;
+       b = __set_blocks(w->data, w->data->keys + n, c);
+
+       if (b * c->sb.block_size > PAGE_SECTORS << JSET_BITS ||
+           b > c->journal.blocks_free) {
+               /* XXX: If we were inserting so many keys that they won't fit in
+                * an _empty_ journal write, we'll deadlock. For now, handle
+                * this in bch_keylist_realloc() - but something to think about.
+                */
+               BUG_ON(!w->data->keys);
+
+               /* XXX: tracepoint */
+               BUG_ON(!closure_wait(&w->wait, cl));
+
+               closure_flush(&c->journal.io);
+
+               journal_try_write(c);
+               continue_at(cl, bch_journal, bcache_wq);
+       }
+
+       memcpy(end(w->data), op->keys.list, n * sizeof(uint64_t));
+       w->data->keys += n;
+
+       op->journal = &fifo_back(&c->journal.pin);
+       atomic_inc(op->journal);
+
+       if (op->flush_journal) {
+               closure_flush(&c->journal.io);
+               closure_wait(&w->wait, cl->parent);
+       }
+
+       journal_try_write(c);
+out:
+       bch_btree_insert_async(cl);
+}
+
+void bch_journal_free(struct cache_set *c)
+{
+       free_pages((unsigned long) c->journal.w[1].data, JSET_BITS);
+       free_pages((unsigned long) c->journal.w[0].data, JSET_BITS);
+       free_fifo(&c->journal.pin);
+}
+
+int bch_journal_alloc(struct cache_set *c)
+{
+       struct journal *j = &c->journal;
+
+       closure_init_unlocked(&j->io);
+       spin_lock_init(&j->lock);
+
+       c->journal_delay_ms = 100;
+
+       j->w[0].c = c;
+       j->w[1].c = c;
+
+       if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) ||
+           !(j->w[0].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS)) ||
+           !(j->w[1].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS)))
+               return -ENOMEM;
+
+       return 0;
+}
diff --git a/drivers/md/bcache/journal.h b/drivers/md/bcache/journal.h
new file mode 100644 (file)
index 0000000..3d78512
--- /dev/null
@@ -0,0 +1,215 @@
+#ifndef _BCACHE_JOURNAL_H
+#define _BCACHE_JOURNAL_H
+
+/*
+ * THE JOURNAL:
+ *
+ * The journal is treated as a circular buffer of buckets - a journal entry
+ * never spans two buckets. This means we can (not yet implemented) resize the
+ * journal at runtime, which will be needed for bcache on raw flash support.
+ *
+ * Journal entries contain a list of keys, ordered by the time they were
+ * inserted; thus journal replay just has to reinsert the keys.
+ *
+ * We also keep some things in the journal header that are logically part of the
+ * superblock - all the things that are frequently updated. This is for future
+ * bcache on raw flash support; the superblock (which will become another
+ * journal) can't be moved or wear leveled, so it contains just enough
+ * information to find the main journal, and the superblock only has to be
+ * rewritten when we want to move/wear level the main journal.
+ *
+ * Currently, we don't journal BTREE_REPLACE operations - this will hopefully be
+ * fixed eventually. This isn't a bug - BTREE_REPLACE is used for insertions
+ * from cache misses, which don't have to be journaled, and for writeback and
+ * moving gc we work around it by flushing the btree to disk before updating the
+ * gc information. But it is a potential issue with incremental garbage
+ * collection, and it's fragile.
+ *
+ * OPEN JOURNAL ENTRIES:
+ *
+ * Each journal entry contains, in the header, the sequence number of the last
+ * journal entry still open - i.e. that has keys that haven't been flushed to
+ * disk in the btree.
+ *
+ * We track this by maintaining a refcount for every open journal entry, in a
+ * fifo; each entry in the fifo corresponds to a particular journal
+ * entry/sequence number. When the refcount at the tail of the fifo goes to
+ * zero, we pop it off - thus, the size of the fifo tells us the number of open
+ * journal entries
+ *
+ * We take a refcount on a journal entry when we add some keys to a journal
+ * entry that we're going to insert (held by struct btree_op), and then when we
+ * insert those keys into the btree the btree write we're setting up takes a
+ * copy of that refcount (held by struct btree_write). That refcount is dropped
+ * when the btree write completes.
+ *
+ * A struct btree_write can only hold a refcount on a single journal entry, but
+ * might contain keys for many journal entries - we handle this by making sure
+ * it always has a refcount on the _oldest_ journal entry of all the journal
+ * entries it has keys for.
+ *
+ * JOURNAL RECLAIM:
+ *
+ * As mentioned previously, our fifo of refcounts tells us the number of open
+ * journal entries; from that and the current journal sequence number we compute
+ * last_seq - the oldest journal entry we still need. We write last_seq in each
+ * journal entry, and we also have to keep track of where it exists on disk so
+ * we don't overwrite it when we loop around the journal.
+ *
+ * To do that we track, for each journal bucket, the sequence number of the
+ * newest journal entry it contains - if we don't need that journal entry we
+ * don't need anything in that bucket anymore. From that we track the last
+ * journal bucket we still need; all this is tracked in struct journal_device
+ * and updated by journal_reclaim().
+ *
+ * JOURNAL FILLING UP:
+ *
+ * There are two ways the journal could fill up; either we could run out of
+ * space to write to, or we could have too many open journal entries and run out
+ * of room in the fifo of refcounts. Since those refcounts are decremented
+ * without any locking we can't safely resize that fifo, so we handle it the
+ * same way.
+ *
+ * If the journal fills up, we start flushing dirty btree nodes until we can
+ * allocate space for a journal write again - preferentially flushing btree
+ * nodes that are pinning the oldest journal entries first.
+ */
+
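+/*
+ * In the code below: the fifo of refcounts is the pin member of struct
+ * journal, the per-bucket sequence numbers are seq[] in struct journal_device,
+ * and the reclaim and btree flushing described above are journal_reclaim()
+ * and btree_flush_write() in journal.c.
+ */
+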
+#define BCACHE_JSET_VERSION_UUIDv1     1
+/* Always latest UUID format */
+#define BCACHE_JSET_VERSION_UUID       1
+#define BCACHE_JSET_VERSION            1
+
+/*
+ * On disk format for a journal entry:
+ * seq is monotonically increasing; every journal entry has its own unique
+ * sequence number.
+ *
+ * last_seq is the oldest journal entry that still has keys the btree hasn't
+ * flushed to disk yet.
+ *
+ * version is for on disk format changes.
+ */
+struct jset {
+       uint64_t                csum;
+       uint64_t                magic;
+       uint64_t                seq;
+       uint32_t                version;
+       uint32_t                keys;
+
+       uint64_t                last_seq;
+
+       BKEY_PADDED(uuid_bucket);
+       BKEY_PADDED(btree_root);
+       uint16_t                btree_level;
+       uint16_t                pad[3];
+
+       uint64_t                prio_bucket[MAX_CACHES_PER_SET];
+
+       union {
+               struct bkey     start[0];
+               uint64_t        d[0];
+       };
+};
+
+/*
+ * Only used for holding the journal entries we read in bch_journal_read()
+ * during cache registration
+ */
+struct journal_replay {
+       struct list_head        list;
+       atomic_t                *pin;
+       struct jset             j;
+};
+
+/*
+ * We put two of these in struct journal; we use them for writes to the
+ * journal that are being staged or in flight.
+ */
+struct journal_write {
+       struct jset             *data;
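+/* data is an order-JSET_BITS page allocation, i.e. 1 << JSET_BITS pages */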
+#define JSET_BITS              3
+
+       struct cache_set        *c;
+       struct closure_waitlist wait;
+       bool                    need_write;
+};
+
+/* Embedded in struct cache_set */
+struct journal {
+       spinlock_t              lock;
+       /* used when waiting because the journal was full */
+       struct closure_waitlist wait;
+       struct closure_with_timer io;
+
+       /* Number of blocks free in the bucket(s) we're currently writing to */
+       unsigned                blocks_free;
+       uint64_t                seq;
+       DECLARE_FIFO(atomic_t, pin);
+
+       BKEY_PADDED(key);
+
+       struct journal_write    w[2], *cur;
+};
+
+/*
+ * Embedded in struct cache. First three fields refer to the array of journal
+ * buckets, in cache_sb.
+ */
+struct journal_device {
+       /*
+        * For each journal bucket, contains the max sequence number of the
+        * journal writes it contains - so we know when a bucket can be reused.
+        */
+       uint64_t                seq[SB_JOURNAL_BUCKETS];
+
+       /* Journal bucket we're currently writing to */
+       unsigned                cur_idx;
+
+       /* Last journal bucket that still contains an open journal entry */
+       unsigned                last_idx;
+
+       /* Next journal bucket to be discarded */
+       unsigned                discard_idx;
+
+#define DISCARD_READY          0
+#define DISCARD_IN_FLIGHT      1
+#define DISCARD_DONE           2
+       /* DISCARD_READY, DISCARD_IN_FLIGHT or DISCARD_DONE */
+       atomic_t                discard_in_flight;
+
+       struct work_struct      discard_work;
+       struct bio              discard_bio;
+       struct bio_vec          discard_bv;
+
+       /* Bio for journal reads/writes to this device */
+       struct bio              bio;
+       struct bio_vec          bv[8];
+};
+
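+/* True if l currently holds a pin on a newer journal entry than r does */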
+#define journal_pin_cmp(c, l, r)                               \
+       (fifo_idx(&(c)->journal.pin, (l)->journal) >            \
+        fifo_idx(&(c)->journal.pin, (r)->journal))
+
+#define JOURNAL_PIN    20000
+
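+/*
+ * Full when there are no blocks left in the bucket(s) we're writing to, or
+ * when the pin fifo has at most one slot left for another open entry.
+ */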
+#define journal_full(j)                                                \
+       (!(j)->blocks_free || fifo_free(&(j)->pin) <= 1)
+
+struct closure;
+struct cache_set;
+struct btree_op;
+
+void bch_journal(struct closure *);
+void bch_journal_next(struct journal *);
+void bch_journal_mark(struct cache_set *, struct list_head *);
+void bch_journal_meta(struct cache_set *, struct closure *);
+int bch_journal_read(struct cache_set *, struct list_head *,
+                       struct btree_op *);
+int bch_journal_replay(struct cache_set *, struct list_head *,
+                         struct btree_op *);
+
+void bch_journal_free(struct cache_set *);
+int bch_journal_alloc(struct cache_set *);
+
+#endif /* _BCACHE_JOURNAL_H */
diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c
new file mode 100644 (file)
index 0000000..8589512
--- /dev/null
@@ -0,0 +1,254 @@
+/*
+ * Moving/copying garbage collector
+ *
+ * Copyright 2012 Google, Inc.
+ */
+
+#include "bcache.h"
+#include "btree.h"
+#include "debug.h"
+#include "request.h"
+
+struct moving_io {
+       struct keybuf_key       *w;
+       struct search           s;
+       struct bbio             bio;
+};
+
+static bool moving_pred(struct keybuf *buf, struct bkey *k)
+{
+       struct cache_set *c = container_of(buf, struct cache_set,
+                                          moving_gc_keys);
+       unsigned i;
+
+       for (i = 0; i < KEY_PTRS(k); i++) {
+               struct cache *ca = PTR_CACHE(c, k, i);
+               struct bucket *g = PTR_BUCKET(c, k, i);
+
+               if (GC_SECTORS_USED(g) < ca->gc_move_threshold)
+                       return true;
+       }
+
+       return false;
+}
+
+/* Moving GC - IO loop */
+
+static void moving_io_destructor(struct closure *cl)
+{
+       struct moving_io *io = container_of(cl, struct moving_io, s.cl);
+       kfree(io);
+}
+
+static void write_moving_finish(struct closure *cl)
+{
+       struct moving_io *io = container_of(cl, struct moving_io, s.cl);
+       struct bio *bio = &io->bio.bio;
+       struct bio_vec *bv = bio_iovec_idx(bio, bio->bi_vcnt);
+
+       while (bv-- != bio->bi_io_vec)
+               __free_page(bv->bv_page);
+
+       pr_debug("%s %s", io->s.op.insert_collision
+                ? "collision moving" : "moved",
+                pkey(&io->w->key));
+
+       bch_keybuf_del(&io->s.op.c->moving_gc_keys, io->w);
+
+       atomic_dec_bug(&io->s.op.c->in_flight);
+       closure_wake_up(&io->s.op.c->moving_gc_wait);
+
+       closure_return_with_destructor(cl, moving_io_destructor);
+}
+
+static void read_moving_endio(struct bio *bio, int error)
+{
+       struct moving_io *io = container_of(bio->bi_private,
+                                           struct moving_io, s.cl);
+
+       if (error)
+               io->s.error = error;
+
+       bch_bbio_endio(io->s.op.c, bio, error, "reading data to move");
+}
+
+static void moving_init(struct moving_io *io)
+{
+       struct bio *bio = &io->bio.bio;
+
+       bio_init(bio);
+       bio_get(bio);
+       bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
+
+       bio->bi_size            = KEY_SIZE(&io->w->key) << 9;
+       bio->bi_max_vecs        = DIV_ROUND_UP(KEY_SIZE(&io->w->key),
+                                              PAGE_SECTORS);
+       bio->bi_private         = &io->s.cl;
+       bio->bi_io_vec          = bio->bi_inline_vecs;
+       bch_bio_map(bio, NULL);
+}
+
+static void write_moving(struct closure *cl)
+{
+       struct search *s = container_of(cl, struct search, cl);
+       struct moving_io *io = container_of(s, struct moving_io, s);
+
+       if (!s->error) {
+               trace_bcache_write_moving(&io->bio.bio);
+
+               moving_init(io);
+
+               io->bio.bio.bi_sector   = KEY_START(&io->w->key);
+               s->op.lock              = -1;
+               s->op.write_prio        = 1;
+               s->op.cache_bio         = &io->bio.bio;
+
+               s->writeback            = KEY_DIRTY(&io->w->key);
+               s->op.csum              = KEY_CSUM(&io->w->key);
+
+               s->op.type = BTREE_REPLACE;
+               bkey_copy(&s->op.replace, &io->w->key);
+
+               closure_init(&s->op.cl, cl);
+               bch_insert_data(&s->op.cl);
+       }
+
+       continue_at(cl, write_moving_finish, NULL);
+}
+
+static void read_moving_submit(struct closure *cl)
+{
+       struct search *s = container_of(cl, struct search, cl);
+       struct moving_io *io = container_of(s, struct moving_io, s);
+       struct bio *bio = &io->bio.bio;
+
+       trace_bcache_read_moving(bio);
+       bch_submit_bbio(bio, s->op.c, &io->w->key, 0);
+
+       continue_at(cl, write_moving, bch_gc_wq);
+}
+
+static void read_moving(struct closure *cl)
+{
+       struct cache_set *c = container_of(cl, struct cache_set, moving_gc);
+       struct keybuf_key *w;
+       struct moving_io *io;
+       struct bio *bio;
+
+       /* XXX: if we error, background writeback could stall indefinitely */
+
+       while (!test_bit(CACHE_SET_STOPPING, &c->flags)) {
+               w = bch_keybuf_next_rescan(c, &c->moving_gc_keys, &MAX_KEY);
+               if (!w)
+                       break;
+
+               io = kzalloc(sizeof(struct moving_io) + sizeof(struct bio_vec)
+                            * DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS),
+                            GFP_KERNEL);
+               if (!io)
+                       goto err;
+
+               w->private      = io;
+               io->w           = w;
+               io->s.op.inode  = KEY_INODE(&w->key);
+               io->s.op.c      = c;
+
+               moving_init(io);
+               bio = &io->bio.bio;
+
+               bio->bi_rw      = READ;
+               bio->bi_end_io  = read_moving_endio;
+
+               if (bch_bio_alloc_pages(bio, GFP_KERNEL))
+                       goto err;
+
+               pr_debug("%s", pkey(&w->key));
+
+               closure_call(&io->s.cl, read_moving_submit, NULL, &c->gc.cl);
+
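+               /* Throttle: keep at most 64 moving gc IOs in flight at once */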
+               if (atomic_inc_return(&c->in_flight) >= 64) {
+                       closure_wait_event(&c->moving_gc_wait, cl,
+                                          atomic_read(&c->in_flight) < 64);
+                       continue_at(cl, read_moving, bch_gc_wq);
+               }
+       }
+
+       if (0) {
+err:           if (!IS_ERR_OR_NULL(w->private))
+                       kfree(w->private);
+
+               bch_keybuf_del(&c->moving_gc_keys, w);
+       }
+
+       closure_return(cl);
+}
+
+static bool bucket_cmp(struct bucket *l, struct bucket *r)
+{
+       return GC_SECTORS_USED(l) < GC_SECTORS_USED(r);
+}
+
+static unsigned bucket_heap_top(struct cache *ca)
+{
+       return GC_SECTORS_USED(heap_peek(&ca->heap));
+}
+
+void bch_moving_gc(struct closure *cl)
+{
+       struct cache_set *c = container_of(cl, struct cache_set, gc.cl);
+       struct cache *ca;
+       struct bucket *b;
+       unsigned i;
+
+       if (!c->copy_gc_enabled)
+               closure_return(cl);
+
+       mutex_lock(&c->bucket_lock);
+
+       for_each_cache(ca, c, i) {
+               unsigned sectors_to_move = 0;
+               unsigned reserve_sectors = ca->sb.bucket_size *
+                       min(fifo_used(&ca->free), ca->free.size / 2);
+
+               ca->heap.used = 0;
+
+               for_each_bucket(b, ca) {
+                       if (!GC_SECTORS_USED(b))
+                               continue;
+
+                       if (!heap_full(&ca->heap)) {
+                               sectors_to_move += GC_SECTORS_USED(b);
+                               heap_add(&ca->heap, b, bucket_cmp);
+                       } else if (bucket_cmp(b, heap_peek(&ca->heap))) {
+                               sectors_to_move -= bucket_heap_top(ca);
+                               sectors_to_move += GC_SECTORS_USED(b);
+
+                               ca->heap.data[0] = b;
+                               heap_sift(&ca->heap, 0, bucket_cmp);
+                       }
+               }
+
+               while (sectors_to_move > reserve_sectors) {
+                       heap_pop(&ca->heap, b, bucket_cmp);
+                       sectors_to_move -= GC_SECTORS_USED(b);
+               }
+
+               ca->gc_move_threshold = bucket_heap_top(ca);
+
+               pr_debug("threshold %u", ca->gc_move_threshold);
+       }
+
+       mutex_unlock(&c->bucket_lock);
+
+       c->moving_gc_keys.last_scanned = ZERO_KEY;
+
+       closure_init(&c->moving_gc, cl);
+       read_moving(&c->moving_gc);
+
+       closure_return(cl);
+}
+
+void bch_moving_init_cache_set(struct cache_set *c)
+{
+       bch_keybuf_init(&c->moving_gc_keys, moving_pred);
+}
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
new file mode 100644 (file)
index 0000000..e5ff12e
--- /dev/null
@@ -0,0 +1,1411 @@
+/*
+ * Main bcache entry point - handle a read or a write request and decide what to
+ * do with it; the make_request functions are called by the block layer.
+ *
+ * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
+ * Copyright 2012 Google, Inc.
+ */
+
+#include "bcache.h"
+#include "btree.h"
+#include "debug.h"
+#include "request.h"
+
+#include <linux/cgroup.h>
+#include <linux/module.h>
+#include <linux/hash.h>
+#include <linux/random.h>
+#include "blk-cgroup.h"
+
+#include <trace/events/bcache.h>
+
+#define CUTOFF_CACHE_ADD       95
+#define CUTOFF_CACHE_READA     90
+#define CUTOFF_WRITEBACK       50
+#define CUTOFF_WRITEBACK_SYNC  75
+
+struct kmem_cache *bch_search_cache;
+
+static void check_should_skip(struct cached_dev *, struct search *);
+
+/* Cgroup interface */
+
+#ifdef CONFIG_CGROUP_BCACHE
+static struct bch_cgroup bcache_default_cgroup = { .cache_mode = -1 };
+
+static struct bch_cgroup *cgroup_to_bcache(struct cgroup *cgroup)
+{
+       struct cgroup_subsys_state *css;
+       return cgroup &&
+               (css = cgroup_subsys_state(cgroup, bcache_subsys_id))
+               ? container_of(css, struct bch_cgroup, css)
+               : &bcache_default_cgroup;
+}
+
+struct bch_cgroup *bch_bio_to_cgroup(struct bio *bio)
+{
+       struct cgroup_subsys_state *css = bio->bi_css
+               ? cgroup_subsys_state(bio->bi_css->cgroup, bcache_subsys_id)
+               : task_subsys_state(current, bcache_subsys_id);
+
+       return css
+               ? container_of(css, struct bch_cgroup, css)
+               : &bcache_default_cgroup;
+}
+
+static ssize_t cache_mode_read(struct cgroup *cgrp, struct cftype *cft,
+                       struct file *file,
+                       char __user *buf, size_t nbytes, loff_t *ppos)
+{
+       char tmp[1024];
+       int len = bch_snprint_string_list(tmp, sizeof(tmp), bch_cache_modes,
+                                         cgroup_to_bcache(cgrp)->cache_mode + 1);
+
+       if (len < 0)
+               return len;
+
+       return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
+}
+
+static int cache_mode_write(struct cgroup *cgrp, struct cftype *cft,
+                           const char *buf)
+{
+       int v = bch_read_string_list(buf, bch_cache_modes);
+       if (v < 0)
+               return v;
+
+       cgroup_to_bcache(cgrp)->cache_mode = v - 1;
+       return 0;
+}
+
+static u64 bch_verify_read(struct cgroup *cgrp, struct cftype *cft)
+{
+       return cgroup_to_bcache(cgrp)->verify;
+}
+
+static int bch_verify_write(struct cgroup *cgrp, struct cftype *cft, u64 val)
+{
+       cgroup_to_bcache(cgrp)->verify = val;
+       return 0;
+}
+
+static u64 bch_cache_hits_read(struct cgroup *cgrp, struct cftype *cft)
+{
+       struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp);
+       return atomic_read(&bcachecg->stats.cache_hits);
+}
+
+static u64 bch_cache_misses_read(struct cgroup *cgrp, struct cftype *cft)
+{
+       struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp);
+       return atomic_read(&bcachecg->stats.cache_misses);
+}
+
+static u64 bch_cache_bypass_hits_read(struct cgroup *cgrp,
+                                        struct cftype *cft)
+{
+       struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp);
+       return atomic_read(&bcachecg->stats.cache_bypass_hits);
+}
+
+static u64 bch_cache_bypass_misses_read(struct cgroup *cgrp,
+                                          struct cftype *cft)
+{
+       struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp);
+       return atomic_read(&bcachecg->stats.cache_bypass_misses);
+}
+
+static struct cftype bch_files[] = {
+       {
+               .name           = "cache_mode",
+               .read           = cache_mode_read,
+               .write_string   = cache_mode_write,
+       },
+       {
+               .name           = "verify",
+               .read_u64       = bch_verify_read,
+               .write_u64      = bch_verify_write,
+       },
+       {
+               .name           = "cache_hits",
+               .read_u64       = bch_cache_hits_read,
+       },
+       {
+               .name           = "cache_misses",
+               .read_u64       = bch_cache_misses_read,
+       },
+       {
+               .name           = "cache_bypass_hits",
+               .read_u64       = bch_cache_bypass_hits_read,
+       },
+       {
+               .name           = "cache_bypass_misses",
+               .read_u64       = bch_cache_bypass_misses_read,
+       },
+       { }     /* terminate */
+};
+
+static void init_bch_cgroup(struct bch_cgroup *cg)
+{
+       cg->cache_mode = -1;
+}
+
+static struct cgroup_subsys_state *bcachecg_create(struct cgroup *cgroup)
+{
+       struct bch_cgroup *cg;
+
+       cg = kzalloc(sizeof(*cg), GFP_KERNEL);
+       if (!cg)
+               return ERR_PTR(-ENOMEM);
+       init_bch_cgroup(cg);
+       return &cg->css;
+}
+
+static void bcachecg_destroy(struct cgroup *cgroup)
+{
+       struct bch_cgroup *cg = cgroup_to_bcache(cgroup);
+       free_css_id(&bcache_subsys, &cg->css);
+       kfree(cg);
+}
+
+struct cgroup_subsys bcache_subsys = {
+       .create         = bcachecg_create,
+       .destroy        = bcachecg_destroy,
+       .subsys_id      = bcache_subsys_id,
+       .name           = "bcache",
+       .module         = THIS_MODULE,
+};
+EXPORT_SYMBOL_GPL(bcache_subsys);
+#endif
+
+static unsigned cache_mode(struct cached_dev *dc, struct bio *bio)
+{
+#ifdef CONFIG_CGROUP_BCACHE
+       int r = bch_bio_to_cgroup(bio)->cache_mode;
+       if (r >= 0)
+               return r;
+#endif
+       return BDEV_CACHE_MODE(&dc->sb);
+}
+
+static bool verify(struct cached_dev *dc, struct bio *bio)
+{
+#ifdef CONFIG_CGROUP_BCACHE
+       if (bch_bio_to_cgroup(bio)->verify)
+               return true;
+#endif
+       return dc->verify;
+}
+
+static void bio_csum(struct bio *bio, struct bkey *k)
+{
+       struct bio_vec *bv;
+       uint64_t csum = 0;
+       int i;
+
+       bio_for_each_segment(bv, bio, i) {
+               void *d = kmap(bv->bv_page) + bv->bv_offset;
+               csum = bch_crc64_update(csum, d, bv->bv_len);
+               kunmap(bv->bv_page);
+       }
+
+       k->ptr[KEY_PTRS(k)] = csum & (~0ULL >> 1);
+}
+
+/* Insert data into cache */
+
+static void bio_invalidate(struct closure *cl)
+{
+       struct btree_op *op = container_of(cl, struct btree_op, cl);
+       struct bio *bio = op->cache_bio;
+
+       pr_debug("invalidating %i sectors from %llu",
+                bio_sectors(bio), (uint64_t) bio->bi_sector);
+
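+       /* Invalidate by inserting zero-pointer keys covering the range, at
+        * most 1 << 14 sectors (8MB) per key.
+        */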
+       while (bio_sectors(bio)) {
+               unsigned len = min(bio_sectors(bio), 1U << 14);
+
+               if (bch_keylist_realloc(&op->keys, 0, op->c))
+                       goto out;
+
+               bio->bi_sector  += len;
+               bio->bi_size    -= len << 9;
+
+               bch_keylist_add(&op->keys,
+                               &KEY(op->inode, bio->bi_sector, len));
+       }
+
+       op->insert_data_done = true;
+       bio_put(bio);
+out:
+       continue_at(cl, bch_journal, bcache_wq);
+}
+
+struct open_bucket {
+       struct list_head        list;
+       struct task_struct      *last;
+       unsigned                sectors_free;
+       BKEY_PADDED(key);
+};
+
+void bch_open_buckets_free(struct cache_set *c)
+{
+       struct open_bucket *b;
+
+       while (!list_empty(&c->data_buckets)) {
+               b = list_first_entry(&c->data_buckets,
+                                    struct open_bucket, list);
+               list_del(&b->list);
+               kfree(b);
+       }
+}
+
+int bch_open_buckets_alloc(struct cache_set *c)
+{
+       int i;
+
+       spin_lock_init(&c->data_bucket_lock);
+
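+       /* A small fixed pool of open buckets; pick_data_bucket() uses it to
+        * keep different write streams in different buckets.
+        */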
+       for (i = 0; i < 6; i++) {
+               struct open_bucket *b = kzalloc(sizeof(*b), GFP_KERNEL);
+               if (!b)
+                       return -ENOMEM;
+
+               list_add(&b->list, &c->data_buckets);
+       }
+
+       return 0;
+}
+
+/*
+ * We keep multiple buckets open for writes, and try to segregate different
+ * write streams for better cache utilization: first we look for a bucket where
+ * the last write to it was sequential with the current write, and failing that
+ * we look for a bucket that was last used by the same task.
+ *
+ * The idea is that if you've got multiple tasks pulling data into the cache
+ * at the same time, you'll get better cache utilization if you try to
+ * segregate their data and preserve locality.
+ *
+ * For example, say you're starting Firefox at the same time you're copying a
+ * bunch of files. Firefox will likely end up being fairly hot and stay in the
+ * cache awhile, but the data you copied might not be; if you wrote all that
+ * data to the same buckets it'd get invalidated at the same time.
+ *
+ * Both of those tasks will be doing fairly random IO so we can't rely on
+ * detecting sequential IO to segregate their data, but going off of the task
+ * should be a sane heuristic.
+ */
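+/*
+ * Lookup order below: first a bucket whose last key ends exactly where this
+ * write starts (i.e. sequential with it), then one last used by the same
+ * task, and failing both, the least recently used open bucket.
+ */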
+static struct open_bucket *pick_data_bucket(struct cache_set *c,
+                                           const struct bkey *search,
+                                           struct task_struct *task,
+                                           struct bkey *alloc)
+{
+       struct open_bucket *ret, *ret_task = NULL;
+
+       list_for_each_entry_reverse(ret, &c->data_buckets, list)
+               if (!bkey_cmp(&ret->key, search))
+                       goto found;
+               else if (ret->last == task)
+                       ret_task = ret;
+
+       ret = ret_task ?: list_first_entry(&c->data_buckets,
+                                          struct open_bucket, list);
+found:
+       if (!ret->sectors_free && KEY_PTRS(alloc)) {
+               ret->sectors_free = c->sb.bucket_size;
+               bkey_copy(&ret->key, alloc);
+               bkey_init(alloc);
+       }
+
+       if (!ret->sectors_free)
+               ret = NULL;
+
+       return ret;
+}
+
+/*
+ * Allocates some space in the cache to write to, sets k to point to the newly
+ * allocated space, and updates KEY_SIZE(k) and KEY_OFFSET(k) (to point to the
+ * end of the newly allocated space).
+ *
+ * May allocate fewer sectors than @sectors; KEY_SIZE(k) indicates how many
+ * sectors were actually allocated.
+ *
+ * If s->writeback is true, will not fail.
+ */
+static bool bch_alloc_sectors(struct bkey *k, unsigned sectors,
+                             struct search *s)
+{
+       struct cache_set *c = s->op.c;
+       struct open_bucket *b;
+       BKEY_PADDED(key) alloc;
+       struct closure cl, *w = NULL;
+       unsigned i;
+
+       if (s->writeback) {
+               closure_init_stack(&cl);
+               w = &cl;
+       }
+
+       /*
+        * We might have to allocate a new bucket, which we can't do with a
+        * spinlock held. So if we have to allocate, we drop the lock, allocate
+        * and then retry. KEY_PTRS() indicates whether alloc points to
+        * allocated bucket(s).
+        */
+
+       bkey_init(&alloc.key);
+       spin_lock(&c->data_bucket_lock);
+
+       while (!(b = pick_data_bucket(c, k, s->task, &alloc.key))) {
+               unsigned watermark = s->op.write_prio
+                       ? WATERMARK_MOVINGGC
+                       : WATERMARK_NONE;
+
+               spin_unlock(&c->data_bucket_lock);
+
+               if (bch_bucket_alloc_set(c, watermark, &alloc.key, 1, w))
+                       return false;
+
+               spin_lock(&c->data_bucket_lock);
+       }
+
+       /*
+        * If we had to allocate, we might race and not need to allocate the
+        * second time we call pick_data_bucket(). If we allocated a bucket but
+        * didn't use it, drop the refcount bch_bucket_alloc_set() took:
+        */
+       if (KEY_PTRS(&alloc.key))
+               __bkey_put(c, &alloc.key);
+
+       for (i = 0; i < KEY_PTRS(&b->key); i++)
+               EBUG_ON(ptr_stale(c, &b->key, i));
+
+       /* Set up the pointer to the space we're allocating: */
+
+       for (i = 0; i < KEY_PTRS(&b->key); i++)
+               k->ptr[i] = b->key.ptr[i];
+
+       sectors = min(sectors, b->sectors_free);
+
+       SET_KEY_OFFSET(k, KEY_OFFSET(k) + sectors);
+       SET_KEY_SIZE(k, sectors);
+       SET_KEY_PTRS(k, KEY_PTRS(&b->key));
+
+       /*
+        * Move b to the end of the lru, and keep track of what this bucket was
+        * last used for:
+        */
+       list_move_tail(&b->list, &c->data_buckets);
+       bkey_copy_key(&b->key, k);
+       b->last = s->task;
+
+       b->sectors_free -= sectors;
+
+       for (i = 0; i < KEY_PTRS(&b->key); i++) {
+               SET_PTR_OFFSET(&b->key, i, PTR_OFFSET(&b->key, i) + sectors);
+
+               atomic_long_add(sectors,
+                               &PTR_CACHE(c, &b->key, i)->sectors_written);
+       }
+
+       if (b->sectors_free < c->sb.block_size)
+               b->sectors_free = 0;
+
+       /*
+        * k takes refcounts on the buckets it points to until it's inserted
+        * into the btree, but if we're done with this bucket we just transfer
+        * get_data_bucket()'s refcount.
+        */
+       if (b->sectors_free)
+               for (i = 0; i < KEY_PTRS(&b->key); i++)
+                       atomic_inc(&PTR_BUCKET(c, &b->key, i)->pin);
+
+       spin_unlock(&c->data_bucket_lock);
+       return true;
+}
+
+static void bch_insert_data_error(struct closure *cl)
+{
+       struct btree_op *op = container_of(cl, struct btree_op, cl);
+
+       /*
+        * Our data write just errored, which means we've got a bunch of keys to
+        * insert that point to data that wasn't successfully written.
+        *
+        * We don't have to insert those keys but we still have to invalidate
+        * that region of the cache - so, if we just strip off all the pointers
+        * from the keys we'll accomplish just that.
+        */
+
+       struct bkey *src = op->keys.bottom, *dst = op->keys.bottom;
+
+       while (src != op->keys.top) {
+               struct bkey *n = bkey_next(src);
+
+               SET_KEY_PTRS(src, 0);
+               bkey_copy(dst, src);
+
+               dst = bkey_next(dst);
+               src = n;
+       }
+
+       op->keys.top = dst;
+
+       bch_journal(cl);
+}
+
+static void bch_insert_data_endio(struct bio *bio, int error)
+{
+       struct closure *cl = bio->bi_private;
+       struct btree_op *op = container_of(cl, struct btree_op, cl);
+       struct search *s = container_of(op, struct search, op);
+
+       if (error) {
+               /* TODO: We could try to recover from this. */
+               if (s->writeback)
+                       s->error = error;
+               else if (s->write)
+                       set_closure_fn(cl, bch_insert_data_error, bcache_wq);
+               else
+                       set_closure_fn(cl, NULL, NULL);
+       }
+
+       bch_bbio_endio(op->c, bio, error, "writing data to cache");
+}
+
+static void bch_insert_data_loop(struct closure *cl)
+{
+       struct btree_op *op = container_of(cl, struct btree_op, cl);
+       struct search *s = container_of(op, struct search, op);
+       struct bio *bio = op->cache_bio, *n;
+
+       if (op->skip)
+               return bio_invalidate(cl);
+
+       if (atomic_sub_return(bio_sectors(bio), &op->c->sectors_to_gc) < 0) {
+               set_gc_sectors(op->c);
+               bch_queue_gc(op->c);
+       }
+
+       do {
+               unsigned i;
+               struct bkey *k;
+               struct bio_set *split = s->d
+                       ? s->d->bio_split : op->c->bio_split;
+
+               /* 1 for the device pointer and 1 for the chksum */
+               if (bch_keylist_realloc(&op->keys,
+                                       1 + (op->csum ? 1 : 0),
+                                       op->c))
+                       continue_at(cl, bch_journal, bcache_wq);
+
+               k = op->keys.top;
+               bkey_init(k);
+               SET_KEY_INODE(k, op->inode);
+               SET_KEY_OFFSET(k, bio->bi_sector);
+
+               if (!bch_alloc_sectors(k, bio_sectors(bio), s))
+                       goto err;
+
+               n = bch_bio_split(bio, KEY_SIZE(k), GFP_NOIO, split);
+               if (!n) {
+                       __bkey_put(op->c, k);
+                       continue_at(cl, bch_insert_data_loop, bcache_wq);
+               }
+
+               n->bi_end_io    = bch_insert_data_endio;
+               n->bi_private   = cl;
+
+               if (s->writeback) {
+                       SET_KEY_DIRTY(k, true);
+
+                       for (i = 0; i < KEY_PTRS(k); i++)
+                               SET_GC_MARK(PTR_BUCKET(op->c, k, i),
+                                           GC_MARK_DIRTY);
+               }
+
+               SET_KEY_CSUM(k, op->csum);
+               if (KEY_CSUM(k))
+                       bio_csum(n, k);
+
+               pr_debug("%s", pkey(k));
+               bch_keylist_push(&op->keys);
+
+               trace_bcache_cache_insert(n, n->bi_sector, n->bi_bdev);
+               n->bi_rw |= REQ_WRITE;
+               bch_submit_bbio(n, op->c, k, 0);
+       } while (n != bio);
+
+       op->insert_data_done = true;
+       continue_at(cl, bch_journal, bcache_wq);
+err:
+       /* bch_alloc_sectors() blocks if s->writeback = true */
+       BUG_ON(s->writeback);
+
+       /*
+        * But if it's not a writeback write we'd rather just bail out if
+        * there aren't any buckets ready to write to - it might take a while and
+        * we might be starving btree writes for gc or something.
+        */
+
+       if (s->write) {
+               /*
+                * Writethrough write: We can't complete the write until we've
+                * updated the index. But we don't want to delay the write while
+                * we wait for buckets to be freed up, so just invalidate the
+                * rest of the write.
+                */
+               op->skip = true;
+               return bio_invalidate(cl);
+       } else {
+               /*
+                * From a cache miss, we can just insert the keys for the data
+                * we have written or bail out if we didn't do anything.
+                */
+               op->insert_data_done = true;
+               bio_put(bio);
+
+               if (!bch_keylist_empty(&op->keys))
+                       continue_at(cl, bch_journal, bcache_wq);
+               else
+                       closure_return(cl);
+       }
+}
+
+/**
+ * bch_insert_data - stick some data in the cache
+ *
+ * This is the starting point for any data to end up in a cache device; it could
+ * be from a normal write, or a writeback write, or a write to a flash only
+ * volume - it's also used by the moving garbage collector to compact data in
+ * mostly empty buckets.
+ *
+ * It first writes the data to the cache, creating a list of keys to be inserted
+ * (if the data had to be fragmented there will be multiple keys); after the
+ * data is written it calls bch_journal, and after the keys have been added to
+ * the next journal write they're inserted into the btree.
+ *
+ * It inserts the data in op->cache_bio; bi_sector is used for the key offset,
+ * and op->inode is used for the key inode.
+ *
+ * If op->skip is true, instead of inserting the data it invalidates the region
+ * of the cache represented by op->cache_bio and op->inode.
+ */
+void bch_insert_data(struct closure *cl)
+{
+       struct btree_op *op = container_of(cl, struct btree_op, cl);
+
+       bch_keylist_init(&op->keys);
+       bio_get(op->cache_bio);
+       bch_insert_data_loop(cl);
+}
+
+void bch_btree_insert_async(struct closure *cl)
+{
+       struct btree_op *op = container_of(cl, struct btree_op, cl);
+       struct search *s = container_of(op, struct search, op);
+
+       if (bch_btree_insert(op, op->c)) {
+               s->error                = -ENOMEM;
+               op->insert_data_done    = true;
+       }
+
+       if (op->insert_data_done) {
+               bch_keylist_free(&op->keys);
+               closure_return(cl);
+       } else
+               continue_at(cl, bch_insert_data_loop, bcache_wq);
+}
+
+/* Common code for the make_request functions */
+
+static void request_endio(struct bio *bio, int error)
+{
+       struct closure *cl = bio->bi_private;
+
+       if (error) {
+               struct search *s = container_of(cl, struct search, cl);
+               s->error = error;
+               /* Only cache read errors are recoverable */
+               s->recoverable = false;
+       }
+
+       bio_put(bio);
+       closure_put(cl);
+}
+
+void bch_cache_read_endio(struct bio *bio, int error)
+{
+       struct bbio *b = container_of(bio, struct bbio, bio);
+       struct closure *cl = bio->bi_private;
+       struct search *s = container_of(cl, struct search, cl);
+
+       /*
+        * If the bucket was reused while our bio was in flight, we might have
+        * read the wrong data. Set s->error but not error so it doesn't get
+        * counted against the cache device, but we'll still reread the data
+        * from the backing device.
+        */
+
+       if (error)
+               s->error = error;
+       else if (ptr_stale(s->op.c, &b->key, 0)) {
+               atomic_long_inc(&s->op.c->cache_read_races);
+               s->error = -EINTR;
+       }
+
+       bch_bbio_endio(s->op.c, bio, error, "reading from cache");
+}
+
+static void bio_complete(struct search *s)
+{
+       if (s->orig_bio) {
+               int cpu, rw = bio_data_dir(s->orig_bio);
+               unsigned long duration = jiffies - s->start_time;
+
+               cpu = part_stat_lock();
+               part_round_stats(cpu, &s->d->disk->part0);
+               part_stat_add(cpu, &s->d->disk->part0, ticks[rw], duration);
+               part_stat_unlock();
+
+               trace_bcache_request_end(s, s->orig_bio);
+               bio_endio(s->orig_bio, s->error);
+               s->orig_bio = NULL;
+       }
+}
+
+static void do_bio_hook(struct search *s)
+{
+       struct bio *bio = &s->bio.bio;
+       memcpy(bio, s->orig_bio, sizeof(struct bio));
+
+       bio->bi_end_io          = request_endio;
+       bio->bi_private         = &s->cl;
+       atomic_set(&bio->bi_cnt, 3);
+}
+
+static void search_free(struct closure *cl)
+{
+       struct search *s = container_of(cl, struct search, cl);
+       bio_complete(s);
+
+       if (s->op.cache_bio)
+               bio_put(s->op.cache_bio);
+
+       if (s->unaligned_bvec)
+               mempool_free(s->bio.bio.bi_io_vec, s->d->unaligned_bvec);
+
+       closure_debug_destroy(cl);
+       mempool_free(s, s->d->c->search);
+}
+
+static struct search *search_alloc(struct bio *bio, struct bcache_device *d)
+{
+       struct bio_vec *bv;
+       struct search *s = mempool_alloc(d->c->search, GFP_NOIO);
+       memset(s, 0, offsetof(struct search, op.keys));
+
+       __closure_init(&s->cl, NULL);
+
+       s->op.inode             = d->id;
+       s->op.c                 = d->c;
+       s->d                    = d;
+       s->op.lock              = -1;
+       s->task                 = current;
+       s->orig_bio             = bio;
+       s->write                = (bio->bi_rw & REQ_WRITE) != 0;
+       s->op.flush_journal     = (bio->bi_rw & REQ_FLUSH) != 0;
+       s->op.skip              = (bio->bi_rw & REQ_DISCARD) != 0;
+       s->recoverable          = 1;
+       s->start_time           = jiffies;
+       do_bio_hook(s);
+
+       if (bio->bi_size != bio_segments(bio) * PAGE_SIZE) {
+               bv = mempool_alloc(d->unaligned_bvec, GFP_NOIO);
+               memcpy(bv, bio_iovec(bio),
+                      sizeof(struct bio_vec) * bio_segments(bio));
+
+               s->bio.bio.bi_io_vec    = bv;
+               s->unaligned_bvec       = 1;
+       }
+
+       return s;
+}
+
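+/*
+ * Kicks off the btree lookup for a read.  If btree_root(search_recurse, ...)
+ * backs out with -EAGAIN we requeue ourselves on bcache_wq and retry;
+ * otherwise the closure ref is dropped and the parent closure carries on
+ * (request_read_done_bh() for cached devices, search_free() for flash only
+ * volumes).
+ */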
+static void btree_read_async(struct closure *cl)
+{
+       struct btree_op *op = container_of(cl, struct btree_op, cl);
+
+       int ret = btree_root(search_recurse, op->c, op);
+
+       if (ret == -EAGAIN)
+               continue_at(cl, btree_read_async, bcache_wq);
+
+       closure_return(cl);
+}
+
+/* Cached devices */
+
+static void cached_dev_bio_complete(struct closure *cl)
+{
+       struct search *s = container_of(cl, struct search, cl);
+       struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
+
+       search_free(cl);
+       cached_dev_put(dc);
+}
+
+/* Process reads */
+
+static void cached_dev_read_complete(struct closure *cl)
+{
+       struct search *s = container_of(cl, struct search, cl);
+
+       if (s->op.insert_collision)
+               bch_mark_cache_miss_collision(s);
+
+       if (s->op.cache_bio) {
+               int i;
+               struct bio_vec *bv;
+
+               __bio_for_each_segment(bv, s->op.cache_bio, i, 0)
+                       __free_page(bv->bv_page);
+       }
+
+       cached_dev_bio_complete(cl);
+}
+
+static void request_read_error(struct closure *cl)
+{
+       struct search *s = container_of(cl, struct search, cl);
+       struct bio_vec *bv;
+       int i;
+
+       if (s->recoverable) {
+               /* The cache read failed, but we can retry from the backing
+                * device.
+                */
+               pr_debug("recovering at sector %llu",
+                        (uint64_t) s->orig_bio->bi_sector);
+
+               s->error = 0;
+               bv = s->bio.bio.bi_io_vec;
+               do_bio_hook(s);
+               s->bio.bio.bi_io_vec = bv;
+
+               if (!s->unaligned_bvec)
+                       bio_for_each_segment(bv, s->orig_bio, i)
+                               bv->bv_offset = 0, bv->bv_len = PAGE_SIZE;
+               else
+                       memcpy(s->bio.bio.bi_io_vec,
+                              bio_iovec(s->orig_bio),
+                              sizeof(struct bio_vec) *
+                              bio_segments(s->orig_bio));
+
+               /* XXX: invalidate cache */
+
+               trace_bcache_read_retry(&s->bio.bio);
+               closure_bio_submit(&s->bio.bio, &s->cl, s->d);
+       }
+
+       continue_at(cl, cached_dev_read_complete, NULL);
+}
+
+static void request_read_done(struct closure *cl)
+{
+       struct search *s = container_of(cl, struct search, cl);
+       struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
+
+       /*
+        * s->op.cache_bio != NULL implies that we had a cache miss; cache_bio now
+        * contains data ready to be inserted into the cache.
+        *
+        * First, we copy the data we just read from cache_bio's bounce buffers
+        * to the buffers the original bio pointed to:
+        */
+
+       if (s->op.cache_bio) {
+               struct bio_vec *src, *dst;
+               unsigned src_offset, dst_offset, bytes;
+               void *dst_ptr;
+
+               bio_reset(s->op.cache_bio);
+               s->op.cache_bio->bi_sector      = s->cache_miss->bi_sector;
+               s->op.cache_bio->bi_bdev        = s->cache_miss->bi_bdev;
+               s->op.cache_bio->bi_size        = s->cache_bio_sectors << 9;
+               bch_bio_map(s->op.cache_bio, NULL);
+
+               src = bio_iovec(s->op.cache_bio);
+               dst = bio_iovec(s->cache_miss);
+               src_offset = src->bv_offset;
+               dst_offset = dst->bv_offset;
+               dst_ptr = kmap(dst->bv_page);
+
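+
+               /*
+                * Walk the source (cache_bio's bounce pages, padded out with
+                * any readahead) and destination (the pages the original miss
+                * bio pointed at) bio_vecs in lockstep, copying until every
+                * destination page has been filled:
+                */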
+               while (1) {
+                       if (dst_offset == dst->bv_offset + dst->bv_len) {
+                               kunmap(dst->bv_page);
+                               dst++;
+                               if (dst == bio_iovec_idx(s->cache_miss,
+                                               s->cache_miss->bi_vcnt))
+                                       break;
+
+                               dst_offset = dst->bv_offset;
+                               dst_ptr = kmap(dst->bv_page);
+                       }
+
+                       if (src_offset == src->bv_offset + src->bv_len) {
+                               src++;
+                               if (src == bio_iovec_idx(s->op.cache_bio,
+                                                s->op.cache_bio->bi_vcnt))
+                                       BUG();
+
+                               src_offset = src->bv_offset;
+                       }
+
+                       bytes = min(dst->bv_offset + dst->bv_len - dst_offset,
+                                   src->bv_offset + src->bv_len - src_offset);
+
+                       memcpy(dst_ptr + dst_offset,
+                              page_address(src->bv_page) + src_offset,
+                              bytes);
+
+                       src_offset      += bytes;
+                       dst_offset      += bytes;
+               }
+
+               bio_put(s->cache_miss);
+               s->cache_miss = NULL;
+       }
+
+       if (verify(dc, &s->bio.bio) && s->recoverable)
+               bch_data_verify(s);
+
+       bio_complete(s);
+
+       if (s->op.cache_bio &&
+           !test_bit(CACHE_SET_STOPPING, &s->op.c->flags)) {
+               s->op.type = BTREE_REPLACE;
+               closure_call(&s->op.cl, bch_insert_data, NULL, cl);
+       }
+
+       continue_at(cl, cached_dev_read_complete, NULL);
+}
+
+static void request_read_done_bh(struct closure *cl)
+{
+       struct search *s = container_of(cl, struct search, cl);
+       struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
+
+       bch_mark_cache_accounting(s, !s->cache_miss, s->op.skip);
+
+       if (s->error)
+               continue_at_nobarrier(cl, request_read_error, bcache_wq);
+       else if (s->op.cache_bio || verify(dc, &s->bio.bio))
+               continue_at_nobarrier(cl, request_read_done, bcache_wq);
+       else
+               continue_at_nobarrier(cl, cached_dev_read_complete, NULL);
+}
+
+static int cached_dev_cache_miss(struct btree *b, struct search *s,
+                                struct bio *bio, unsigned sectors)
+{
+       int ret = 0;
+       unsigned reada;
+       struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
+       struct bio *miss;
+
+       miss = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split);
+       if (!miss)
+               return -EAGAIN;
+
+       if (miss == bio)
+               s->op.lookup_done = true;
+
+       miss->bi_end_io         = request_endio;
+       miss->bi_private        = &s->cl;
+
+       if (s->cache_miss || s->op.skip)
+               goto out_submit;
+
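+       /*
+        * Decide how much to read ahead: none for partial misses (the miss bio
+        * was split from the original), explicit readahead or metadata
+        * requests, or when the cache is getting full; otherwise up to
+        * dc->readahead bytes, clamped so we don't run off the end of the
+        * backing device.
+        */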
+       if (miss != bio ||
+           (bio->bi_rw & REQ_RAHEAD) ||
+           (bio->bi_rw & REQ_META) ||
+           s->op.c->gc_stats.in_use >= CUTOFF_CACHE_READA)
+               reada = 0;
+       else {
+               reada = min(dc->readahead >> 9,
+                           sectors - bio_sectors(miss));
+
+               if (bio_end(miss) + reada > bdev_sectors(miss->bi_bdev))
+                       reada = bdev_sectors(miss->bi_bdev) - bio_end(miss);
+       }
+
+       s->cache_bio_sectors = bio_sectors(miss) + reada;
+       s->op.cache_bio = bio_alloc_bioset(GFP_NOWAIT,
+                       DIV_ROUND_UP(s->cache_bio_sectors, PAGE_SECTORS),
+                       dc->disk.bio_split);
+
+       if (!s->op.cache_bio)
+               goto out_submit;
+
+       s->op.cache_bio->bi_sector      = miss->bi_sector;
+       s->op.cache_bio->bi_bdev        = miss->bi_bdev;
+       s->op.cache_bio->bi_size        = s->cache_bio_sectors << 9;
+
+       s->op.cache_bio->bi_end_io      = request_endio;
+       s->op.cache_bio->bi_private     = &s->cl;
+
+       /* btree_search_recurse()'s btree iterator is no good anymore */
+       ret = -EINTR;
+       if (!bch_btree_insert_check_key(b, &s->op, s->op.cache_bio))
+               goto out_put;
+
+       bch_bio_map(s->op.cache_bio, NULL);
+       if (bch_bio_alloc_pages(s->op.cache_bio, __GFP_NOWARN|GFP_NOIO))
+               goto out_put;
+
+       s->cache_miss = miss;
+       bio_get(s->op.cache_bio);
+
+       trace_bcache_cache_miss(s->orig_bio);
+       closure_bio_submit(s->op.cache_bio, &s->cl, s->d);
+
+       return ret;
+out_put:
+       bio_put(s->op.cache_bio);
+       s->op.cache_bio = NULL;
+out_submit:
+       closure_bio_submit(miss, &s->cl, s->d);
+       return ret;
+}
+
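+/*
+ * Read path: kick off the btree lookup, which submits reads from the cache for
+ * the parts we have and calls cached_dev_cache_miss() for the parts we don't.
+ * Once all the resulting bios have completed, request_read_done_bh() either
+ * retries a failed cache read from the backing device, copies miss data out of
+ * the bounce buffers and inserts it into the cache, or just completes the
+ * request.
+ */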
+static void request_read(struct cached_dev *dc, struct search *s)
+{
+       struct closure *cl = &s->cl;
+
+       check_should_skip(dc, s);
+       closure_call(&s->op.cl, btree_read_async, NULL, cl);
+
+       continue_at(cl, request_read_done_bh, NULL);
+}
+
+/* Process writes */
+
+static void cached_dev_write_complete(struct closure *cl)
+{
+       struct search *s = container_of(cl, struct search, cl);
+       struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
+
+       up_read_non_owner(&dc->writeback_lock);
+       cached_dev_bio_complete(cl);
+}
+
+static bool should_writeback(struct cached_dev *dc, struct bio *bio)
+{
+       unsigned threshold = (bio->bi_rw & REQ_SYNC)
+               ? CUTOFF_WRITEBACK_SYNC
+               : CUTOFF_WRITEBACK;
+
+       return !atomic_read(&dc->disk.detaching) &&
+               cache_mode(dc, bio) == CACHE_MODE_WRITEBACK &&
+               dc->disk.c->gc_stats.in_use < threshold;
+}
+
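+/*
+ * Three paths for writes: writeback (the dirty data only goes into the cache,
+ * and bch_writeback_add() accounts it so it can be flushed later), writethrough
+ * (the write goes to the backing device and a clone of it is inserted into the
+ * cache), and skip/bypass (op->skip makes bch_insert_data() invalidate the
+ * cached range and the write goes straight to the backing device; discards the
+ * backing device can't handle are completed without being submitted).
+ */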
+static void request_write(struct cached_dev *dc, struct search *s)
+{
+       struct closure *cl = &s->cl;
+       struct bio *bio = &s->bio.bio;
+       struct bkey start, end;
+       start = KEY(dc->disk.id, bio->bi_sector, 0);
+       end = KEY(dc->disk.id, bio_end(bio), 0);
+
+       bch_keybuf_check_overlapping(&s->op.c->moving_gc_keys, &start, &end);
+
+       check_should_skip(dc, s);
+       down_read_non_owner(&dc->writeback_lock);
+
+       if (bch_keybuf_check_overlapping(&dc->writeback_keys, &start, &end)) {
+               s->op.skip      = false;
+               s->writeback    = true;
+       }
+
+       if (bio->bi_rw & REQ_DISCARD)
+               goto skip;
+
+       if (s->op.skip)
+               goto skip;
+
+       if (should_writeback(dc, s->orig_bio))
+               s->writeback = true;
+
+       if (!s->writeback) {
+               s->op.cache_bio = bio_clone_bioset(bio, GFP_NOIO,
+                                                  dc->disk.bio_split);
+
+               trace_bcache_writethrough(s->orig_bio);
+               closure_bio_submit(bio, cl, s->d);
+       } else {
+               s->op.cache_bio = bio;
+               trace_bcache_writeback(s->orig_bio);
+               bch_writeback_add(dc, bio_sectors(bio));
+       }
+out:
+       closure_call(&s->op.cl, bch_insert_data, NULL, cl);
+       continue_at(cl, cached_dev_write_complete, NULL);
+skip:
+       s->op.skip = true;
+       s->op.cache_bio = s->orig_bio;
+       bio_get(s->op.cache_bio);
+       trace_bcache_write_skip(s->orig_bio);
+
+       if ((bio->bi_rw & REQ_DISCARD) &&
+           !blk_queue_discard(bdev_get_queue(dc->bdev)))
+               goto out;
+
+       closure_bio_submit(bio, cl, s->d);
+       goto out;
+}
+
+static void request_nodata(struct cached_dev *dc, struct search *s)
+{
+       struct closure *cl = &s->cl;
+       struct bio *bio = &s->bio.bio;
+
+       if (bio->bi_rw & REQ_DISCARD) {
+               request_write(dc, s);
+               return;
+       }
+
+       if (s->op.flush_journal)
+               bch_journal_meta(s->op.c, cl);
+
+       closure_bio_submit(bio, cl, s->d);
+
+       continue_at(cl, cached_dev_bio_complete, NULL);
+}
+
+/* Cached devices - read & write stuff */
+
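+/*
+ * Returns 0 if the cache isn't considered congested, otherwise a value that
+ * shrinks as congestion gets worse.  c->congested is driven negative elsewhere
+ * when cache I/O is slow; here we add back roughly one count per millisecond
+ * elapsed since the last reading and map whatever is still negative into
+ * (0, CONGESTED_MAX].  check_should_skip() uses a non-zero result as a tighter
+ * sequential cutoff, so a congested cache bypasses more I/O to the backing
+ * device.
+ */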
+int bch_get_congested(struct cache_set *c)
+{
+       int i;
+
+       if (!c->congested_read_threshold_us &&
+           !c->congested_write_threshold_us)
+               return 0;
+
+       i = (local_clock_us() - c->congested_last_us) / 1024;
+       if (i < 0)
+               return 0;
+
+       i += atomic_read(&c->congested);
+       if (i >= 0)
+               return 0;
+
+       i += CONGESTED_MAX;
+
+       return i <= 0 ? 1 : fract_exp_two(i, 6);
+}
+
+static void add_sequential(struct task_struct *t)
+{
+       ewma_add(t->sequential_io_avg,
+                t->sequential_io, 8, 0);
+
+       t->sequential_io = 0;
+}
+
+static struct hlist_head *iohash(struct cached_dev *dc, uint64_t k)
+{
+       return &dc->io_hash[hash_64(k, RECENT_IO_BITS)];
+}
+
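+/*
+ * Decides whether this request should bypass the cache entirely.  We skip the
+ * cache when the device is detaching, the cache is too full, the request is a
+ * discard or isn't block aligned, the cache mode says so (none, or writearound
+ * for writes), or the I/O looks sequential past the cutoff.  Sequentiality is
+ * tracked per task, and with sequential_merge also by matching this request's
+ * start sector against the end sectors of recently seen I/Os, so streams
+ * interleaved across tasks are still detected.  A congested cache (see
+ * bch_get_congested()) tightens the cutoff.
+ */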
+static void check_should_skip(struct cached_dev *dc, struct search *s)
+{
+       struct cache_set *c = s->op.c;
+       struct bio *bio = &s->bio.bio;
+
+       long rand;
+       int cutoff = bch_get_congested(c);
+       unsigned mode = cache_mode(dc, bio);
+
+       if (atomic_read(&dc->disk.detaching) ||
+           c->gc_stats.in_use > CUTOFF_CACHE_ADD ||
+           (bio->bi_rw & REQ_DISCARD))
+               goto skip;
+
+       if (mode == CACHE_MODE_NONE ||
+           (mode == CACHE_MODE_WRITEAROUND &&
+            (bio->bi_rw & REQ_WRITE)))
+               goto skip;
+
+       if (bio->bi_sector   & (c->sb.block_size - 1) ||
+           bio_sectors(bio) & (c->sb.block_size - 1)) {
+               pr_debug("skipping unaligned io");
+               goto skip;
+       }
+
+       if (!cutoff) {
+               cutoff = dc->sequential_cutoff >> 9;
+
+               if (!cutoff)
+                       goto rescale;
+
+               if (mode == CACHE_MODE_WRITEBACK &&
+                   (bio->bi_rw & REQ_WRITE) &&
+                   (bio->bi_rw & REQ_SYNC))
+                       goto rescale;
+       }
+
+       if (dc->sequential_merge) {
+               struct io *i;
+
+               spin_lock(&dc->io_lock);
+
+               hlist_for_each_entry(i, iohash(dc, bio->bi_sector), hash)
+                       if (i->last == bio->bi_sector &&
+                           time_before(jiffies, i->jiffies))
+                               goto found;
+
+               i = list_first_entry(&dc->io_lru, struct io, lru);
+
+               add_sequential(s->task);
+               i->sequential = 0;
+found:
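+               /* Guard against i->sequential overflowing when we add bi_size */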
+               if (i->sequential + bio->bi_size > i->sequential)
+                       i->sequential   += bio->bi_size;
+
+               i->last                  = bio_end(bio);
+               i->jiffies               = jiffies + msecs_to_jiffies(5000);
+               s->task->sequential_io   = i->sequential;
+
+               hlist_del(&i->hash);
+               hlist_add_head(&i->hash, iohash(dc, i->last));
+               list_move_tail(&i->lru, &dc->io_lru);
+
+               spin_unlock(&dc->io_lock);
+       } else {
+               s->task->sequential_io = bio->bi_size;
+
+               add_sequential(s->task);
+       }
+
+       rand = get_random_int();
+       cutoff -= bitmap_weight(&rand, BITS_PER_LONG);
+
+       if (cutoff <= (int) (max(s->task->sequential_io,
+                                s->task->sequential_io_avg) >> 9))
+               goto skip;
+
+rescale:
+       bch_rescale_priorities(c, bio_sectors(bio));
+       return;
+skip:
+       bch_mark_sectors_bypassed(s, bio_sectors(bio));
+       s->op.skip = true;
+}
+
+static void cached_dev_make_request(struct request_queue *q, struct bio *bio)
+{
+       struct search *s;
+       struct bcache_device *d = bio->bi_bdev->bd_disk->private_data;
+       struct cached_dev *dc = container_of(d, struct cached_dev, disk);
+       int cpu, rw = bio_data_dir(bio);
+
+       cpu = part_stat_lock();
+       part_stat_inc(cpu, &d->disk->part0, ios[rw]);
+       part_stat_add(cpu, &d->disk->part0, sectors[rw], bio_sectors(bio));
+       part_stat_unlock();
+
+       bio->bi_bdev = dc->bdev;
+       bio->bi_sector += dc->sb.data_offset;
+
+       if (cached_dev_get(dc)) {
+               s = search_alloc(bio, d);
+               trace_bcache_request_start(s, bio);
+
+               if (!bio_has_data(bio))
+                       request_nodata(dc, s);
+               else if (rw)
+                       request_write(dc, s);
+               else
+                       request_read(dc, s);
+       } else {
+               if ((bio->bi_rw & REQ_DISCARD) &&
+                   !blk_queue_discard(bdev_get_queue(dc->bdev)))
+                       bio_endio(bio, 0);
+               else
+                       bch_generic_make_request(bio, &d->bio_split_hook);
+       }
+}
+
+static int cached_dev_ioctl(struct bcache_device *d, fmode_t mode,
+                           unsigned int cmd, unsigned long arg)
+{
+       struct cached_dev *dc = container_of(d, struct cached_dev, disk);
+       return __blkdev_driver_ioctl(dc->bdev, mode, cmd, arg);
+}
+
+static int cached_dev_congested(void *data, int bits)
+{
+       struct bcache_device *d = data;
+       struct cached_dev *dc = container_of(d, struct cached_dev, disk);
+       struct request_queue *q = bdev_get_queue(dc->bdev);
+       int ret = 0;
+
+       if (bdi_congested(&q->backing_dev_info, bits))
+               return 1;
+
+       if (cached_dev_get(dc)) {
+               unsigned i;
+               struct cache *ca;
+
+               for_each_cache(ca, d->c, i) {
+                       q = bdev_get_queue(ca->bdev);
+                       ret |= bdi_congested(&q->backing_dev_info, bits);
+               }
+
+               cached_dev_put(dc);
+       }
+
+       return ret;
+}
+
+void bch_cached_dev_request_init(struct cached_dev *dc)
+{
+       struct gendisk *g = dc->disk.disk;
+
+       g->queue->make_request_fn               = cached_dev_make_request;
+       g->queue->backing_dev_info.congested_fn = cached_dev_congested;
+       dc->disk.cache_miss                     = cached_dev_cache_miss;
+       dc->disk.ioctl                          = cached_dev_ioctl;
+}
+
+/* Flash backed devices */
+
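+/*
+ * Flash only volumes live entirely in the cache set, so a lookup miss just
+ * means a hole in the volume: fill the unmapped part of the bio with zeroes
+ * rather than reading anything from a backing device.
+ */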
+static int flash_dev_cache_miss(struct btree *b, struct search *s,
+                               struct bio *bio, unsigned sectors)
+{
+       /* Zero fill bio */
+
+       while (bio->bi_idx != bio->bi_vcnt) {
+               struct bio_vec *bv = bio_iovec(bio);
+               unsigned j = min(bv->bv_len >> 9, sectors);
+
+               void *p = kmap(bv->bv_page);
+               memset(p + bv->bv_offset, 0, j << 9);
+               kunmap(bv->bv_page);
+
+               bv->bv_len      -= j << 9;
+               bv->bv_offset   += j << 9;
+
+               if (bv->bv_len)
+                       return 0;
+
+               bio->bi_sector  += j;
+               bio->bi_size    -= j << 9;
+
+               bio->bi_idx++;
+               sectors         -= j;
+       }
+
+       s->op.lookup_done = true;
+
+       return 0;
+}
+
+static void flash_dev_make_request(struct request_queue *q, struct bio *bio)
+{
+       struct search *s;
+       struct closure *cl;
+       struct bcache_device *d = bio->bi_bdev->bd_disk->private_data;
+       int cpu, rw = bio_data_dir(bio);
+
+       cpu = part_stat_lock();
+       part_stat_inc(cpu, &d->disk->part0, ios[rw]);
+       part_stat_add(cpu, &d->disk->part0, sectors[rw], bio_sectors(bio));
+       part_stat_unlock();
+
+       s = search_alloc(bio, d);
+       cl = &s->cl;
+       bio = &s->bio.bio;
+
+       trace_bcache_request_start(s, bio);
+
+       if (bio_has_data(bio) && !rw) {
+               closure_call(&s->op.cl, btree_read_async, NULL, cl);
+       } else if (bio_has_data(bio) || s->op.skip) {
+               bch_keybuf_check_overlapping(&s->op.c->moving_gc_keys,
+                                            &KEY(d->id, bio->bi_sector, 0),
+                                            &KEY(d->id, bio_end(bio), 0));
+
+               s->writeback    = true;
+               s->op.cache_bio = bio;
+
+               closure_call(&s->op.cl, bch_insert_data, NULL, cl);
+       } else {
+               /* No data - probably a cache flush */
+               if (s->op.flush_journal)
+                       bch_journal_meta(s->op.c, cl);
+       }
+
+       continue_at(cl, search_free, NULL);
+}
+
+static int flash_dev_ioctl(struct bcache_device *d, fmode_t mode,
+                          unsigned int cmd, unsigned long arg)
+{
+       return -ENOTTY;
+}
+
+static int flash_dev_congested(void *data, int bits)
+{
+       struct bcache_device *d = data;
+       struct request_queue *q;
+       struct cache *ca;
+       unsigned i;
+       int ret = 0;
+
+       for_each_cache(ca, d->c, i) {
+               q = bdev_get_queue(ca->bdev);
+               ret |= bdi_congested(&q->backing_dev_info, bits);
+       }
+
+       return ret;
+}
+
+void bch_flash_dev_request_init(struct bcache_device *d)
+{
+       struct gendisk *g = d->disk;
+
+       g->queue->make_request_fn               = flash_dev_make_request;
+       g->queue->backing_dev_info.congested_fn = flash_dev_congested;
+       d->cache_miss                           = flash_dev_cache_miss;
+       d->ioctl                                = flash_dev_ioctl;
+}
+
+void bch_request_exit(void)
+{
+#ifdef CONFIG_CGROUP_BCACHE
+       cgroup_unload_subsys(&bcache_subsys);
+#endif
+       if (bch_search_cache)
+               kmem_cache_destroy(bch_search_cache);
+}
+
+int __init bch_request_init(void)
+{
+       bch_search_cache = KMEM_CACHE(search, 0);
+       if (!bch_search_cache)
+               return -ENOMEM;
+
+#ifdef CONFIG_CGROUP_BCACHE
+       cgroup_load_subsys(&bcache_subsys);
+       init_bch_cgroup(&bcache_default_cgroup);
+
+       cgroup_add_cftypes(&bcache_subsys, bch_files);
+#endif
+       return 0;
+}
diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h
new file mode 100644 (file)
index 0000000..254d9ab
--- /dev/null
@@ -0,0 +1,62 @@
+#ifndef _BCACHE_REQUEST_H_
+#define _BCACHE_REQUEST_H_
+
+#include <linux/cgroup.h>
+
+struct search {
+       /* Stack frame for bio_complete */
+       struct closure          cl;
+
+       struct bcache_device    *d;
+       struct task_struct      *task;
+
+       struct bbio             bio;
+       struct bio              *orig_bio;
+       struct bio              *cache_miss;
+       unsigned                cache_bio_sectors;
+
+       unsigned                recoverable:1;
+       unsigned                unaligned_bvec:1;
+
+       unsigned                write:1;
+       unsigned                writeback:1;
+
+       /* IO error returned to s->bio */
+       short                   error;
+       unsigned long           start_time;
+
+       /* Anything past op->keys won't get zeroed in search_alloc() */
+       struct btree_op         op;
+};
+
+void bch_cache_read_endio(struct bio *, int);
+int bch_get_congested(struct cache_set *);
+void bch_insert_data(struct closure *cl);
+void bch_btree_insert_async(struct closure *);
+
+void bch_open_buckets_free(struct cache_set *);
+int bch_open_buckets_alloc(struct cache_set *);
+
+void bch_cached_dev_request_init(struct cached_dev *dc);
+void bch_flash_dev_request_init(struct bcache_device *d);
+
+extern struct kmem_cache *bch_search_cache, *bch_passthrough_cache;
+
+struct bch_cgroup {
+#ifdef CONFIG_CGROUP_BCACHE
+       struct cgroup_subsys_state      css;
+#endif
+       /*
+        * We subtract one from the index into bch_cache_modes[], so that
+        * default == -1; this makes it so the rest match up with d->cache_mode,
+        * and we use d->cache_mode if cgrp->cache_mode < 0
+        */
+       short                           cache_mode;
+       bool                            verify;
+       struct cache_stat_collector     stats;
+};
+
+struct bch_cgroup *bch_bio_to_cgroup(struct bio *bio);
+
+#endif /* _BCACHE_REQUEST_H_ */
diff --git a/drivers/md/bcache/stats.c b/drivers/md/bcache/stats.c
new file mode 100644 (file)
index 0000000..64e6794
--- /dev/null
@@ -0,0 +1,246 @@
+/*
+ * bcache stats code
+ *
+ * Copyright 2012 Google, Inc.
+ */
+
+#include "bcache.h"
+#include "stats.h"
+#include "btree.h"
+#include "request.h"
+#include "sysfs.h"
+
+/*
+ * We keep absolute totals of various statistics, and additionally a set of three
+ * rolling averages.
+ *
+ * Every so often, a timer goes off and rescales the rolling averages.
+ * The *_RESCALE constants are how many times the timer has to go off before we
+ * rescale each set of numbers; that gets us half-lives of 5 minutes, one hour,
+ * and one day.
+ *
+ * accounting_delay is how often the timer goes off - 22 times in 5 minutes,
+ * and accounting_weight is what we use to rescale:
+ *
+ * pow(31 / 32, 22) ~= 1/2
+ *
+ * So that we don't have to increment each set of numbers every time we (say)
+ * get a cache hit, we increment a single atomic_t in acc->collector, and when
+ * the rescale function runs it resets the atomic counter to 0 and adds its
+ * old value to each of the exported numbers.
+ *
+ * To reduce rounding error, the numbers in struct cache_stats are all
+ * stored left shifted by 16, and scaled back in the sysfs show() function.
+ */
+
+static const unsigned DAY_RESCALE              = 288;
+static const unsigned HOUR_RESCALE             = 12;
+static const unsigned FIVE_MINUTE_RESCALE      = 1;
+static const unsigned accounting_delay         = (HZ * 300) / 22;
+static const unsigned accounting_weight                = 32;
+
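+/*
+ * Putting those numbers together: the timer fires every 300/22 s ~= 13.6 s.
+ * The five minute stats rescale on every firing, so they see 22 rescales per
+ * five minutes and (31/32)^22 ~= 1/2 gives them a five minute half-life.  The
+ * hour stats rescale every 12 firings (22 rescales per hour) and the day stats
+ * every 288 firings (22 rescales per day: 300/22 * 288 * 22 = 86400 s).  The
+ * totals pass 0 as their rescale interval and are effectively never decayed.
+ *
+ * The left shift by 16 happens in move_stat() when a collector count is folded
+ * into the unsigned long stats, and the sysfs show() code shifts it back out;
+ * the extra 16 bits of fraction keep the repeated *31/32 scaling from
+ * immediately truncating small counts to zero.
+ */
+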
+/* sysfs reading/writing */
+
+read_attribute(cache_hits);
+read_attribute(cache_misses);
+read_attribute(cache_bypass_hits);
+read_attribute(cache_bypass_misses);
+read_attribute(cache_hit_ratio);
+read_attribute(cache_readaheads);
+read_attribute(cache_miss_collisions);
+read_attribute(bypassed);
+
+SHOW(bch_stats)
+{
+       struct cache_stats *s =
+               container_of(kobj, struct cache_stats, kobj);
+#define var(stat)              (s->stat >> 16)
+       var_print(cache_hits);
+       var_print(cache_misses);
+       var_print(cache_bypass_hits);
+       var_print(cache_bypass_misses);
+
+       sysfs_print(cache_hit_ratio,
+                   DIV_SAFE(var(cache_hits) * 100,
+                            var(cache_hits) + var(cache_misses)));
+
+       var_print(cache_readaheads);
+       var_print(cache_miss_collisions);
+       sysfs_hprint(bypassed,  var(sectors_bypassed) << 9);
+#undef var
+       return 0;
+}
+
+STORE(bch_stats)
+{
+       return size;
+}
+
+static void bch_stats_release(struct kobject *k)
+{
+}
+
+static struct attribute *bch_stats_files[] = {
+       &sysfs_cache_hits,
+       &sysfs_cache_misses,
+       &sysfs_cache_bypass_hits,
+       &sysfs_cache_bypass_misses,
+       &sysfs_cache_hit_ratio,
+       &sysfs_cache_readaheads,
+       &sysfs_cache_miss_collisions,
+       &sysfs_bypassed,
+       NULL
+};
+static KTYPE(bch_stats);
+
+static void scale_accounting(unsigned long data);
+
+void bch_cache_accounting_init(struct cache_accounting *acc,
+                              struct closure *parent)
+{
+       kobject_init(&acc->total.kobj,          &bch_stats_ktype);
+       kobject_init(&acc->five_minute.kobj,    &bch_stats_ktype);
+       kobject_init(&acc->hour.kobj,           &bch_stats_ktype);
+       kobject_init(&acc->day.kobj,            &bch_stats_ktype);
+
+       closure_init(&acc->cl, parent);
+       init_timer(&acc->timer);
+       acc->timer.expires      = jiffies + accounting_delay;
+       acc->timer.data         = (unsigned long) acc;
+       acc->timer.function     = scale_accounting;
+       add_timer(&acc->timer);
+}
+
+int bch_cache_accounting_add_kobjs(struct cache_accounting *acc,
+                                  struct kobject *parent)
+{
+       int ret = kobject_add(&acc->total.kobj, parent,
+                             "stats_total");
+       ret = ret ?: kobject_add(&acc->five_minute.kobj, parent,
+                                "stats_five_minute");
+       ret = ret ?: kobject_add(&acc->hour.kobj, parent,
+                                "stats_hour");
+       ret = ret ?: kobject_add(&acc->day.kobj, parent,
+                                "stats_day");
+       return ret;
+}
+
+void bch_cache_accounting_clear(struct cache_accounting *acc)
+{
+       memset(&acc->total.cache_hits,
+              0,
+              sizeof(unsigned long) * 7);
+}
+
+void bch_cache_accounting_destroy(struct cache_accounting *acc)
+{
+       kobject_put(&acc->total.kobj);
+       kobject_put(&acc->five_minute.kobj);
+       kobject_put(&acc->hour.kobj);
+       kobject_put(&acc->day.kobj);
+
+       atomic_set(&acc->closing, 1);
+       if (del_timer_sync(&acc->timer))
+               closure_return(&acc->cl);
+}
+
+/* EWMA scaling */
+
+static void scale_stat(unsigned long *stat)
+{
+       *stat =  ewma_add(*stat, 0, accounting_weight, 0);
+}
+
+static void scale_stats(struct cache_stats *stats, unsigned long rescale_at)
+{
+       if (++stats->rescale == rescale_at) {
+               stats->rescale = 0;
+               scale_stat(&stats->cache_hits);
+               scale_stat(&stats->cache_misses);
+               scale_stat(&stats->cache_bypass_hits);
+               scale_stat(&stats->cache_bypass_misses);
+               scale_stat(&stats->cache_readaheads);
+               scale_stat(&stats->cache_miss_collisions);
+               scale_stat(&stats->sectors_bypassed);
+       }
+}
+
+static void scale_accounting(unsigned long data)
+{
+       struct cache_accounting *acc = (struct cache_accounting *) data;
+
+#define move_stat(name) do {                                           \
+       unsigned t = atomic_xchg(&acc->collector.name, 0);              \
+       t <<= 16;                                                       \
+       acc->five_minute.name += t;                                     \
+       acc->hour.name += t;                                            \
+       acc->day.name += t;                                             \
+       acc->total.name += t;                                           \
+} while (0)
+
+       move_stat(cache_hits);
+       move_stat(cache_misses);
+       move_stat(cache_bypass_hits);
+       move_stat(cache_bypass_misses);
+       move_stat(cache_readaheads);
+       move_stat(cache_miss_collisions);
+       move_stat(sectors_bypassed);
+
+       scale_stats(&acc->total, 0);
+       scale_stats(&acc->day, DAY_RESCALE);
+       scale_stats(&acc->hour, HOUR_RESCALE);
+       scale_stats(&acc->five_minute, FIVE_MINUTE_RESCALE);
+
+       acc->timer.expires += accounting_delay;
+
+       if (!atomic_read(&acc->closing))
+               add_timer(&acc->timer);
+       else
+               closure_return(&acc->cl);
+}
+
+static void mark_cache_stats(struct cache_stat_collector *stats,
+                            bool hit, bool bypass)
+{
+       if (!bypass) {
+               if (hit)
+                       atomic_inc(&stats->cache_hits);
+               else
+                       atomic_inc(&stats->cache_misses);
+       } else {
+               if (hit)
+                       atomic_inc(&stats->cache_bypass_hits);
+               else
+                       atomic_inc(&stats->cache_bypass_misses);
+       }
+}
+
+void bch_mark_cache_accounting(struct search *s, bool hit, bool bypass)
+{
+       struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
+       mark_cache_stats(&dc->accounting.collector, hit, bypass);
+       mark_cache_stats(&s->op.c->accounting.collector, hit, bypass);
+#ifdef CONFIG_CGROUP_BCACHE
+       mark_cache_stats(&(bch_bio_to_cgroup(s->orig_bio)->stats), hit, bypass);
+#endif
+}
+
+void bch_mark_cache_readahead(struct search *s)
+{
+       struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
+       atomic_inc(&dc->accounting.collector.cache_readaheads);
+       atomic_inc(&s->op.c->accounting.collector.cache_readaheads);
+}
+
+void bch_mark_cache_miss_collision(struct search *s)
+{
+       struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
+       atomic_inc(&dc->accounting.collector.cache_miss_collisions);
+       atomic_inc(&s->op.c->accounting.collector.cache_miss_collisions);
+}
+
+void bch_mark_sectors_bypassed(struct search *s, int sectors)
+{
+       struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
+       atomic_add(sectors, &dc->accounting.collector.sectors_bypassed);
+       atomic_add(sectors, &s->op.c->accounting.collector.sectors_bypassed);
+}
diff --git a/drivers/md/bcache/stats.h b/drivers/md/bcache/stats.h
new file mode 100644 (file)
index 0000000..c7c7a8f
--- /dev/null
@@ -0,0 +1,58 @@
+#ifndef _BCACHE_STATS_H_
+#define _BCACHE_STATS_H_
+
+struct cache_stat_collector {
+       atomic_t cache_hits;
+       atomic_t cache_misses;
+       atomic_t cache_bypass_hits;
+       atomic_t cache_bypass_misses;
+       atomic_t cache_readaheads;
+       atomic_t cache_miss_collisions;
+       atomic_t sectors_bypassed;
+};
+
+struct cache_stats {
+       struct kobject          kobj;
+
+       unsigned long cache_hits;
+       unsigned long cache_misses;
+       unsigned long cache_bypass_hits;
+       unsigned long cache_bypass_misses;
+       unsigned long cache_readaheads;
+       unsigned long cache_miss_collisions;
+       unsigned long sectors_bypassed;
+
+       unsigned                rescale;
+};
+
+struct cache_accounting {
+       struct closure          cl;
+       struct timer_list       timer;
+       atomic_t                closing;
+
+       struct cache_stat_collector collector;
+
+       struct cache_stats total;
+       struct cache_stats five_minute;
+       struct cache_stats hour;
+       struct cache_stats day;
+};
+
+struct search;
+
+void bch_cache_accounting_init(struct cache_accounting *acc,
+                              struct closure *parent);
+
+int bch_cache_accounting_add_kobjs(struct cache_accounting *acc,
+                                  struct kobject *parent);
+
+void bch_cache_accounting_clear(struct cache_accounting *acc);
+
+void bch_cache_accounting_destroy(struct cache_accounting *acc);
+
+void bch_mark_cache_accounting(struct search *s, bool hit, bool bypass);
+void bch_mark_cache_readahead(struct search *s);
+void bch_mark_cache_miss_collision(struct search *s);
+void bch_mark_sectors_bypassed(struct search *s, int sectors);
+
+#endif /* _BCACHE_STATS_H_ */
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
new file mode 100644 (file)
index 0000000..c8046bc
--- /dev/null
@@ -0,0 +1,1987 @@
+/*
+ * bcache setup/teardown code, and some metadata io - read a superblock and
+ * figure out what to do with it.
+ *
+ * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
+ * Copyright 2012 Google, Inc.
+ */
+
+#include "bcache.h"
+#include "btree.h"
+#include "debug.h"
+#include "request.h"
+
+#include <linux/buffer_head.h>
+#include <linux/debugfs.h>
+#include <linux/genhd.h>
+#include <linux/module.h>
+#include <linux/random.h>
+#include <linux/reboot.h>
+#include <linux/sysfs.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>");
+
+static const char bcache_magic[] = {
+       0xc6, 0x85, 0x73, 0xf6, 0x4e, 0x1a, 0x45, 0xca,
+       0x82, 0x65, 0xf5, 0x7f, 0x48, 0xba, 0x6d, 0x81
+};
+
+static const char invalid_uuid[] = {
+       0xa0, 0x3e, 0xf8, 0xed, 0x3e, 0xe1, 0xb8, 0x78,
+       0xc8, 0x50, 0xfc, 0x5e, 0xcb, 0x16, 0xcd, 0x99
+};
+
+/* Default is -1; we skip past it for struct cached_dev's cache mode */
+const char * const bch_cache_modes[] = {
+       "default",
+       "writethrough",
+       "writeback",
+       "writearound",
+       "none",
+       NULL
+};
+
+struct uuid_entry_v0 {
+       uint8_t         uuid[16];
+       uint8_t         label[32];
+       uint32_t        first_reg;
+       uint32_t        last_reg;
+       uint32_t        invalidated;
+       uint32_t        pad;
+};
+
+static struct kobject *bcache_kobj;
+struct mutex bch_register_lock;
+LIST_HEAD(bch_cache_sets);
+static LIST_HEAD(uncached_devices);
+
+static int bcache_major, bcache_minor;
+static wait_queue_head_t unregister_wait;
+struct workqueue_struct *bcache_wq;
+
+#define BTREE_MAX_PAGES                (256 * 1024 / PAGE_SIZE)
+
+static void bio_split_pool_free(struct bio_split_pool *p)
+{
+       if (p->bio_split_hook)
+               mempool_destroy(p->bio_split_hook);
+
+       if (p->bio_split)
+               bioset_free(p->bio_split);
+}
+
+static int bio_split_pool_init(struct bio_split_pool *p)
+{
+       p->bio_split = bioset_create(4, 0);
+       if (!p->bio_split)
+               return -ENOMEM;
+
+       p->bio_split_hook = mempool_create_kmalloc_pool(4,
+                               sizeof(struct bio_split_hook));
+       if (!p->bio_split_hook)
+               return -ENOMEM;
+
+       return 0;
+}
+
+/* Superblock */
+
+static const char *read_super(struct cache_sb *sb, struct block_device *bdev,
+                             struct page **res)
+{
+       const char *err;
+       struct cache_sb *s;
+       struct buffer_head *bh = __bread(bdev, 1, SB_SIZE);
+       unsigned i;
+
+       if (!bh)
+               return "IO error";
+
+       s = (struct cache_sb *) bh->b_data;
+
+       sb->offset              = le64_to_cpu(s->offset);
+       sb->version             = le64_to_cpu(s->version);
+
+       memcpy(sb->magic,       s->magic, 16);
+       memcpy(sb->uuid,        s->uuid, 16);
+       memcpy(sb->set_uuid,    s->set_uuid, 16);
+       memcpy(sb->label,       s->label, SB_LABEL_SIZE);
+
+       sb->flags               = le64_to_cpu(s->flags);
+       sb->seq                 = le64_to_cpu(s->seq);
+       sb->last_mount          = le32_to_cpu(s->last_mount);
+       sb->first_bucket        = le16_to_cpu(s->first_bucket);
+       sb->keys                = le16_to_cpu(s->keys);
+
+       for (i = 0; i < SB_JOURNAL_BUCKETS; i++)
+               sb->d[i] = le64_to_cpu(s->d[i]);
+
+       pr_debug("read sb version %llu, flags %llu, seq %llu, journal size %u",
+                sb->version, sb->flags, sb->seq, sb->keys);
+
+       err = "Not a bcache superblock";
+       if (sb->offset != SB_SECTOR)
+               goto err;
+
+       if (memcmp(sb->magic, bcache_magic, 16))
+               goto err;
+
+       err = "Too many journal buckets";
+       if (sb->keys > SB_JOURNAL_BUCKETS)
+               goto err;
+
+       err = "Bad checksum";
+       if (s->csum != csum_set(s))
+               goto err;
+
+       err = "Bad UUID";
+       if (bch_is_zero(sb->uuid, 16))
+               goto err;
+
+       sb->block_size  = le16_to_cpu(s->block_size);
+
+       err = "Superblock block size smaller than device block size";
+       if (sb->block_size << 9 < bdev_logical_block_size(bdev))
+               goto err;
+
+       switch (sb->version) {
+       case BCACHE_SB_VERSION_BDEV:
+               sb->data_offset = BDEV_DATA_START_DEFAULT;
+               break;
+       case BCACHE_SB_VERSION_BDEV_WITH_OFFSET:
+               sb->data_offset = le64_to_cpu(s->data_offset);
+
+               err = "Bad data offset";
+               if (sb->data_offset < BDEV_DATA_START_DEFAULT)
+                       goto err;
+
+               break;
+       case BCACHE_SB_VERSION_CDEV:
+       case BCACHE_SB_VERSION_CDEV_WITH_UUID:
+               sb->nbuckets    = le64_to_cpu(s->nbuckets);
+               sb->block_size  = le16_to_cpu(s->block_size);
+               sb->bucket_size = le16_to_cpu(s->bucket_size);
+
+               sb->nr_in_set   = le16_to_cpu(s->nr_in_set);
+               sb->nr_this_dev = le16_to_cpu(s->nr_this_dev);
+
+               err = "Too many buckets";
+               if (sb->nbuckets > LONG_MAX)
+                       goto err;
+
+               err = "Not enough buckets";
+               if (sb->nbuckets < 1 << 7)
+                       goto err;
+
+               err = "Bad block/bucket size";
+               if (!is_power_of_2(sb->block_size) ||
+                   sb->block_size > PAGE_SECTORS ||
+                   !is_power_of_2(sb->bucket_size) ||
+                   sb->bucket_size < PAGE_SECTORS)
+                       goto err;
+
+               err = "Invalid superblock: device too small";
+               if (get_capacity(bdev->bd_disk) < sb->bucket_size * sb->nbuckets)
+                       goto err;
+
+               err = "Bad UUID";
+               if (bch_is_zero(sb->set_uuid, 16))
+                       goto err;
+
+               err = "Bad cache device number in set";
+               if (!sb->nr_in_set ||
+                   sb->nr_in_set <= sb->nr_this_dev ||
+                   sb->nr_in_set > MAX_CACHES_PER_SET)
+                       goto err;
+
+               err = "Journal buckets not sequential";
+               for (i = 0; i < sb->keys; i++)
+                       if (sb->d[i] != sb->first_bucket + i)
+                               goto err;
+
+               err = "Too many journal buckets";
+               if (sb->first_bucket + sb->keys > sb->nbuckets)
+                       goto err;
+
+               err = "Invalid superblock: first bucket comes before end of super";
+               if (sb->first_bucket * sb->bucket_size < 16)
+                       goto err;
+
+               break;
+       default:
+               err = "Unsupported superblock version";
+               goto err;
+       }
+
+       sb->last_mount = get_seconds();
+       err = NULL;
+
+       get_page(bh->b_page);
+       *res = bh->b_page;
+err:
+       put_bh(bh);
+       return err;
+}
+
+static void write_bdev_super_endio(struct bio *bio, int error)
+{
+       struct cached_dev *dc = bio->bi_private;
+       /* XXX: error checking */
+
+       closure_put(&dc->sb_write.cl);
+}
+
+static void __write_super(struct cache_sb *sb, struct bio *bio)
+{
+       struct cache_sb *out = page_address(bio->bi_io_vec[0].bv_page);
+       unsigned i;
+
+       bio->bi_sector  = SB_SECTOR;
+       bio->bi_rw      = REQ_SYNC|REQ_META;
+       bio->bi_size    = SB_SIZE;
+       bch_bio_map(bio, NULL);
+
+       out->offset             = cpu_to_le64(sb->offset);
+       out->version            = cpu_to_le64(sb->version);
+
+       memcpy(out->uuid,       sb->uuid, 16);
+       memcpy(out->set_uuid,   sb->set_uuid, 16);
+       memcpy(out->label,      sb->label, SB_LABEL_SIZE);
+
+       out->flags              = cpu_to_le64(sb->flags);
+       out->seq                = cpu_to_le64(sb->seq);
+
+       out->last_mount         = cpu_to_le32(sb->last_mount);
+       out->first_bucket       = cpu_to_le16(sb->first_bucket);
+       out->keys               = cpu_to_le16(sb->keys);
+
+       for (i = 0; i < sb->keys; i++)
+               out->d[i] = cpu_to_le64(sb->d[i]);
+
+       out->csum = csum_set(out);
+
+       pr_debug("ver %llu, flags %llu, seq %llu",
+                sb->version, sb->flags, sb->seq);
+
+       submit_bio(REQ_WRITE, bio);
+}
+
+void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent)
+{
+       struct closure *cl = &dc->sb_write.cl;
+       struct bio *bio = &dc->sb_bio;
+
+       closure_lock(&dc->sb_write, parent);
+
+       bio_reset(bio);
+       bio->bi_bdev    = dc->bdev;
+       bio->bi_end_io  = write_bdev_super_endio;
+       bio->bi_private = dc;
+
+       closure_get(cl);
+       __write_super(&dc->sb, bio);
+
+       closure_return(cl);
+}
+
+static void write_super_endio(struct bio *bio, int error)
+{
+       struct cache *ca = bio->bi_private;
+
+       bch_count_io_errors(ca, error, "writing superblock");
+       closure_put(&ca->set->sb_write.cl);
+}
+
+void bcache_write_super(struct cache_set *c)
+{
+       struct closure *cl = &c->sb_write.cl;
+       struct cache *ca;
+       unsigned i;
+
+       closure_lock(&c->sb_write, &c->cl);
+
+       c->sb.seq++;
+
+       for_each_cache(ca, c, i) {
+               struct bio *bio = &ca->sb_bio;
+
+               ca->sb.version          = BCACHE_SB_VERSION_CDEV_WITH_UUID;
+               ca->sb.seq              = c->sb.seq;
+               ca->sb.last_mount       = c->sb.last_mount;
+
+               SET_CACHE_SYNC(&ca->sb, CACHE_SYNC(&c->sb));
+
+               bio_reset(bio);
+               bio->bi_bdev    = ca->bdev;
+               bio->bi_end_io  = write_super_endio;
+               bio->bi_private = ca;
+
+               closure_get(cl);
+               __write_super(&ca->sb, bio);
+       }
+
+       closure_return(cl);
+}
+
+/* UUID io */
+
+static void uuid_endio(struct bio *bio, int error)
+{
+       struct closure *cl = bio->bi_private;
+       struct cache_set *c = container_of(cl, struct cache_set, uuid_write.cl);
+
+       cache_set_err_on(error, c, "accessing uuids");
+       bch_bbio_free(bio, c);
+       closure_put(cl);
+}
+
+static void uuid_io(struct cache_set *c, unsigned long rw,
+                   struct bkey *k, struct closure *parent)
+{
+       struct closure *cl = &c->uuid_write.cl;
+       struct uuid_entry *u;
+       unsigned i;
+
+       BUG_ON(!parent);
+       closure_lock(&c->uuid_write, parent);
+
+       for (i = 0; i < KEY_PTRS(k); i++) {
+               struct bio *bio = bch_bbio_alloc(c);
+
+               bio->bi_rw      = REQ_SYNC|REQ_META|rw;
+               bio->bi_size    = KEY_SIZE(k) << 9;
+
+               bio->bi_end_io  = uuid_endio;
+               bio->bi_private = cl;
+               bch_bio_map(bio, c->uuids);
+
+               bch_submit_bbio(bio, c, k, i);
+
+               if (!(rw & WRITE))
+                       break;
+       }
+
+       pr_debug("%s UUIDs at %s", rw & REQ_WRITE ? "wrote" : "read",
+                pkey(&c->uuid_bucket));
+
+       for (u = c->uuids; u < c->uuids + c->nr_uuids; u++)
+               if (!bch_is_zero(u->uuid, 16))
+                       pr_debug("Slot %zi: %pU: %s: 1st: %u last: %u inv: %u",
+                                u - c->uuids, u->uuid, u->label,
+                                u->first_reg, u->last_reg, u->invalidated);
+
+       closure_return(cl);
+}
+
+static char *uuid_read(struct cache_set *c, struct jset *j, struct closure *cl)
+{
+       struct bkey *k = &j->uuid_bucket;
+
+       if (__bch_ptr_invalid(c, 1, k))
+               return "bad uuid pointer";
+
+       bkey_copy(&c->uuid_bucket, k);
+       uuid_io(c, READ_SYNC, k, cl);
+
+       if (j->version < BCACHE_JSET_VERSION_UUIDv1) {
+               struct uuid_entry_v0    *u0 = (void *) c->uuids;
+               struct uuid_entry       *u1 = (void *) c->uuids;
+               int i;
+
+               closure_sync(cl);
+
+               /*
+                * Since the new uuid entry is bigger than the old, we have to
+                * convert starting at the highest memory address and work down
+                * in order to do it in place
+                */
+
+               for (i = c->nr_uuids - 1;
+                    i >= 0;
+                    --i) {
+                       memcpy(u1[i].uuid,      u0[i].uuid, 16);
+                       memcpy(u1[i].label,     u0[i].label, 32);
+
+                       u1[i].first_reg         = u0[i].first_reg;
+                       u1[i].last_reg          = u0[i].last_reg;
+                       u1[i].invalidated       = u0[i].invalidated;
+
+                       u1[i].flags     = 0;
+                       u1[i].sectors   = 0;
+               }
+       }
+
+       return NULL;
+}
+
+static int __uuid_write(struct cache_set *c)
+{
+       BKEY_PADDED(key) k;
+       struct closure cl;
+       closure_init_stack(&cl);
+
+       lockdep_assert_held(&bch_register_lock);
+
+       if (bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, &cl))
+               return 1;
+
+       SET_KEY_SIZE(&k.key, c->sb.bucket_size);
+       uuid_io(c, REQ_WRITE, &k.key, &cl);
+       closure_sync(&cl);
+
+       bkey_copy(&c->uuid_bucket, &k.key);
+       __bkey_put(c, &k.key);
+       return 0;
+}
+
+int bch_uuid_write(struct cache_set *c)
+{
+       int ret = __uuid_write(c);
+
+       if (!ret)
+               bch_journal_meta(c, NULL);
+
+       return ret;
+}
+
+static struct uuid_entry *uuid_find(struct cache_set *c, const char *uuid)
+{
+       struct uuid_entry *u;
+
+       for (u = c->uuids;
+            u < c->uuids + c->nr_uuids; u++)
+               if (!memcmp(u->uuid, uuid, 16))
+                       return u;
+
+       return NULL;
+}
+
+static struct uuid_entry *uuid_find_empty(struct cache_set *c)
+{
+       static const char zero_uuid[16] = "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0";
+       return uuid_find(c, zero_uuid);
+}
+
+/*
+ * Bucket priorities/gens:
+ *
+ * For each bucket, we store on disk its
+   * 8 bit gen
+   * 16 bit priority
+ *
+ * See alloc.c for an explanation of the gen. The priority is used to implement
+ * lru (and in the future other) cache replacement policies; for most purposes
+ * it's just an opaque integer.
+ *
+ * The gens and the priorities don't have a whole lot to do with each other, and
+ * it's actually the gens that must be written out at specific times - it's no
+ * big deal if the priorities don't get written; if we lose them we just reuse
+ * buckets in suboptimal order.
+ *
+ * On disk they're stored in a packed array, and in as many buckets as are required
+ * to fit them all. The buckets we use to store them form a list; the journal
+ * header points to the first bucket, the first bucket points to the second
+ * bucket, et cetera.
+ *
+ * This code is used by the allocation code; periodically (whenever it runs out
+ * of buckets to allocate from) the allocation code will invalidate some
+ * buckets, but it can't use those buckets until their new gens are safely on
+ * disk.
+ */
+
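+/*
+ * Each bucket in the chain written by bch_prio_write() holds a struct prio_set:
+ * a checksum, a magic number, the number of the next bucket in the chain
+ * (next_bucket) and an array of packed {16 bit prio, 8 bit gen} entries, one
+ * per cache bucket.  prio_read() walks the same chain to restore each bucket's
+ * prio and gen.
+ */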
+static void prio_endio(struct bio *bio, int error)
+{
+       struct cache *ca = bio->bi_private;
+
+       cache_set_err_on(error, ca->set, "accessing priorities");
+       bch_bbio_free(bio, ca->set);
+       closure_put(&ca->prio);
+}
+
+static void prio_io(struct cache *ca, uint64_t bucket, unsigned long rw)
+{
+       struct closure *cl = &ca->prio;
+       struct bio *bio = bch_bbio_alloc(ca->set);
+
+       closure_init_stack(cl);
+
+       bio->bi_sector  = bucket * ca->sb.bucket_size;
+       bio->bi_bdev    = ca->bdev;
+       bio->bi_rw      = REQ_SYNC|REQ_META|rw;
+       bio->bi_size    = bucket_bytes(ca);
+
+       bio->bi_end_io  = prio_endio;
+       bio->bi_private = ca;
+       bch_bio_map(bio, ca->disk_buckets);
+
+       closure_bio_submit(bio, &ca->prio, ca);
+       closure_sync(cl);
+}
+
+#define buckets_free(c)        "free %zu, free_inc %zu, unused %zu",           \
+       fifo_used(&c->free), fifo_used(&c->free_inc), fifo_used(&c->unused)
+
+void bch_prio_write(struct cache *ca)
+{
+       int i;
+       struct bucket *b;
+       struct closure cl;
+
+       closure_init_stack(&cl);
+
+       lockdep_assert_held(&ca->set->bucket_lock);
+
+       for (b = ca->buckets;
+            b < ca->buckets + ca->sb.nbuckets; b++)
+               b->disk_gen = b->gen;
+
+       ca->disk_buckets->seq++;
+
+       atomic_long_add(ca->sb.bucket_size * prio_buckets(ca),
+                       &ca->meta_sectors_written);
+
+       pr_debug("free %zu, free_inc %zu, unused %zu", fifo_used(&ca->free),
+                fifo_used(&ca->free_inc), fifo_used(&ca->unused));
+       blktrace_msg(ca, "Starting priorities: " buckets_free(ca));
+
+       for (i = prio_buckets(ca) - 1; i >= 0; --i) {
+               long bucket;
+               struct prio_set *p = ca->disk_buckets;
+               struct bucket_disk *d = p->data;
+               struct bucket_disk *end = d + prios_per_bucket(ca);
+
+               for (b = ca->buckets + i * prios_per_bucket(ca);
+                    b < ca->buckets + ca->sb.nbuckets && d < end;
+                    b++, d++) {
+                       d->prio = cpu_to_le16(b->prio);
+                       d->gen = b->gen;
+               }
+
+               p->next_bucket  = ca->prio_buckets[i + 1];
+               p->magic        = pset_magic(ca);
+               p->csum         = bch_crc64(&p->magic, bucket_bytes(ca) - 8);
+
+               bucket = bch_bucket_alloc(ca, WATERMARK_PRIO, &cl);
+               BUG_ON(bucket == -1);
+
+               mutex_unlock(&ca->set->bucket_lock);
+               prio_io(ca, bucket, REQ_WRITE);
+               mutex_lock(&ca->set->bucket_lock);
+
+               ca->prio_buckets[i] = bucket;
+               atomic_dec_bug(&ca->buckets[bucket].pin);
+       }
+
+       mutex_unlock(&ca->set->bucket_lock);
+
+       bch_journal_meta(ca->set, &cl);
+       closure_sync(&cl);
+
+       mutex_lock(&ca->set->bucket_lock);
+
+       ca->need_save_prio = 0;
+
+       /*
+        * Don't want the old priorities to get garbage collected until after we
+        * finish writing the new ones, and they're journalled
+        */
+       for (i = 0; i < prio_buckets(ca); i++)
+               ca->prio_last_buckets[i] = ca->prio_buckets[i];
+}
+
+static void prio_read(struct cache *ca, uint64_t bucket)
+{
+       struct prio_set *p = ca->disk_buckets;
+       struct bucket_disk *d = p->data + prios_per_bucket(ca), *end = d;
+       struct bucket *b;
+       unsigned bucket_nr = 0;
+
+       for (b = ca->buckets;
+            b < ca->buckets + ca->sb.nbuckets;
+            b++, d++) {
+               if (d == end) {
+                       ca->prio_buckets[bucket_nr] = bucket;
+                       ca->prio_last_buckets[bucket_nr] = bucket;
+                       bucket_nr++;
+
+                       prio_io(ca, bucket, READ_SYNC);
+
+                       if (p->csum != bch_crc64(&p->magic, bucket_bytes(ca) - 8))
+                               pr_warn("bad csum reading priorities");
+
+                       if (p->magic != pset_magic(ca))
+                               pr_warn("bad magic reading priorities");
+
+                       bucket = p->next_bucket;
+                       d = p->data;
+               }
+
+               b->prio = le16_to_cpu(d->prio);
+               b->gen = b->disk_gen = b->last_gc = b->gc_gen = d->gen;
+       }
+}
+
+/* Bcache device */
+
+static int open_dev(struct block_device *b, fmode_t mode)
+{
+       struct bcache_device *d = b->bd_disk->private_data;
+       if (atomic_read(&d->closing))
+               return -ENXIO;
+
+       closure_get(&d->cl);
+       return 0;
+}
+
+static int release_dev(struct gendisk *b, fmode_t mode)
+{
+       struct bcache_device *d = b->private_data;
+       closure_put(&d->cl);
+       return 0;
+}
+
+static int ioctl_dev(struct block_device *b, fmode_t mode,
+                    unsigned int cmd, unsigned long arg)
+{
+       struct bcache_device *d = b->bd_disk->private_data;
+       return d->ioctl(d, mode, cmd, arg);
+}
+
+static const struct block_device_operations bcache_ops = {
+       .open           = open_dev,
+       .release        = release_dev,
+       .ioctl          = ioctl_dev,
+       .owner          = THIS_MODULE,
+};
+
+void bcache_device_stop(struct bcache_device *d)
+{
+       if (!atomic_xchg(&d->closing, 1))
+               closure_queue(&d->cl);
+}
+
+static void bcache_device_unlink(struct bcache_device *d)
+{
+       unsigned i;
+       struct cache *ca;
+
+       sysfs_remove_link(&d->c->kobj, d->name);
+       sysfs_remove_link(&d->kobj, "cache");
+
+       for_each_cache(ca, d->c, i)
+               bd_unlink_disk_holder(ca->bdev, d->disk);
+}
+
+static void bcache_device_link(struct bcache_device *d, struct cache_set *c,
+                              const char *name)
+{
+       unsigned i;
+       struct cache *ca;
+
+       for_each_cache(ca, d->c, i)
+               bd_link_disk_holder(ca->bdev, d->disk);
+
+       snprintf(d->name, BCACHEDEVNAME_SIZE,
+                "%s%u", name, d->id);
+
+       WARN(sysfs_create_link(&d->kobj, &c->kobj, "cache") ||
+            sysfs_create_link(&c->kobj, &d->kobj, d->name),
+            "Couldn't create device <-> cache set symlinks");
+}
+
+static void bcache_device_detach(struct bcache_device *d)
+{
+       lockdep_assert_held(&bch_register_lock);
+
+       if (atomic_read(&d->detaching)) {
+               struct uuid_entry *u = d->c->uuids + d->id;
+
+               SET_UUID_FLASH_ONLY(u, 0);
+               memcpy(u->uuid, invalid_uuid, 16);
+               u->invalidated = cpu_to_le32(get_seconds());
+               bch_uuid_write(d->c);
+
+               atomic_set(&d->detaching, 0);
+       }
+
+       bcache_device_unlink(d);
+
+       d->c->devices[d->id] = NULL;
+       closure_put(&d->c->caching);
+       d->c = NULL;
+}
+
+static void bcache_device_attach(struct bcache_device *d, struct cache_set *c,
+                                unsigned id)
+{
+       BUG_ON(test_bit(CACHE_SET_STOPPING, &c->flags));
+
+       d->id = id;
+       d->c = c;
+       c->devices[id] = d;
+
+       closure_get(&c->caching);
+}
+
+static void bcache_device_free(struct bcache_device *d)
+{
+       lockdep_assert_held(&bch_register_lock);
+
+       pr_info("%s stopped", d->disk->disk_name);
+
+       if (d->c)
+               bcache_device_detach(d);
+
+       if (d->disk)
+               del_gendisk(d->disk);
+       if (d->disk && d->disk->queue)
+               blk_cleanup_queue(d->disk->queue);
+       if (d->disk)
+               put_disk(d->disk);
+
+       bio_split_pool_free(&d->bio_split_hook);
+       if (d->unaligned_bvec)
+               mempool_destroy(d->unaligned_bvec);
+       if (d->bio_split)
+               bioset_free(d->bio_split);
+
+       closure_debug_destroy(&d->cl);
+}
+
+static int bcache_device_init(struct bcache_device *d, unsigned block_size)
+{
+       struct request_queue *q;
+
+       if (!(d->bio_split = bioset_create(4, offsetof(struct bbio, bio))) ||
+           !(d->unaligned_bvec = mempool_create_kmalloc_pool(1,
+                               sizeof(struct bio_vec) * BIO_MAX_PAGES)) ||
+           bio_split_pool_init(&d->bio_split_hook))
+               return -ENOMEM;
+
+       d->disk = alloc_disk(1);
+       if (!d->disk)
+               return -ENOMEM;
+
+       snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", bcache_minor);
+
+       d->disk->major          = bcache_major;
+       d->disk->first_minor    = bcache_minor++;
+       d->disk->fops           = &bcache_ops;
+       d->disk->private_data   = d;
+
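+       /*
+        * Bio-based queue: the real make_request entry point is installed
+        * later by the per-device request code (bch_cached_dev_request_init()
+        * or bch_flash_dev_request_init()), so NULL is passed here. The
+        * limits are left wide open because bcache splits bios itself.
+        */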
+       q = blk_alloc_queue(GFP_KERNEL);
+       if (!q)
+               return -ENOMEM;
+
+       blk_queue_make_request(q, NULL);
+       d->disk->queue                  = q;
+       q->queuedata                    = d;
+       q->backing_dev_info.congested_data = d;
+       q->limits.max_hw_sectors        = UINT_MAX;
+       q->limits.max_sectors           = UINT_MAX;
+       q->limits.max_segment_size      = UINT_MAX;
+       q->limits.max_segments          = BIO_MAX_PAGES;
+       q->limits.max_discard_sectors   = UINT_MAX;
+       q->limits.io_min                = block_size;
+       q->limits.logical_block_size    = block_size;
+       q->limits.physical_block_size   = block_size;
+       set_bit(QUEUE_FLAG_NONROT,      &d->disk->queue->queue_flags);
+       set_bit(QUEUE_FLAG_DISCARD,     &d->disk->queue->queue_flags);
+
+       return 0;
+}
+
+/* Cached device */
+
+static void calc_cached_dev_sectors(struct cache_set *c)
+{
+       uint64_t sectors = 0;
+       struct cached_dev *dc;
+
+       list_for_each_entry(dc, &c->cached_devs, list)
+               sectors += bdev_sectors(dc->bdev);
+
+       c->cached_dev_sectors = sectors;
+}
+
+void bch_cached_dev_run(struct cached_dev *dc)
+{
+       struct bcache_device *d = &dc->disk;
+
+       if (atomic_xchg(&dc->running, 1))
+               return;
+
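+       /*
+        * Running the backing device without a cache attached: if the
+        * superblock says it was previously attached, mark it stale so any
+        * data left in the old cache won't be trusted on a later attach.
+        */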
+       if (!d->c &&
+           BDEV_STATE(&dc->sb) != BDEV_STATE_NONE) {
+               struct closure cl;
+               closure_init_stack(&cl);
+
+               SET_BDEV_STATE(&dc->sb, BDEV_STATE_STALE);
+               bch_write_bdev_super(dc, &cl);
+               closure_sync(&cl);
+       }
+
+       add_disk(d->disk);
+       bd_link_disk_holder(dc->bdev, dc->disk.disk);
+#if 0
+       char *env[] = { "SYMLINK=label" , NULL };
+       kobject_uevent_env(&disk_to_dev(d->disk)->kobj, KOBJ_CHANGE, env);
+#endif
+       if (sysfs_create_link(&d->kobj, &disk_to_dev(d->disk)->kobj, "dev") ||
+           sysfs_create_link(&disk_to_dev(d->disk)->kobj, &d->kobj, "bcache"))
+               pr_debug("error creating sysfs link");
+}
+
+static void cached_dev_detach_finish(struct work_struct *w)
+{
+       struct cached_dev *dc = container_of(w, struct cached_dev, detach);
+       char buf[BDEVNAME_SIZE];
+       struct closure cl;
+       closure_init_stack(&cl);
+
+       BUG_ON(!atomic_read(&dc->disk.detaching));
+       BUG_ON(atomic_read(&dc->count));
+
+       mutex_lock(&bch_register_lock);
+
+       memset(&dc->sb.set_uuid, 0, 16);
+       SET_BDEV_STATE(&dc->sb, BDEV_STATE_NONE);
+
+       bch_write_bdev_super(dc, &cl);
+       closure_sync(&cl);
+
+       bcache_device_detach(&dc->disk);
+       list_move(&dc->list, &uncached_devices);
+
+       mutex_unlock(&bch_register_lock);
+
+       pr_info("Caching disabled for %s", bdevname(dc->bdev, buf));
+
+       /* Drop ref we took in cached_dev_detach() */
+       closure_put(&dc->disk.cl);
+}
+
+void bch_cached_dev_detach(struct cached_dev *dc)
+{
+       lockdep_assert_held(&bch_register_lock);
+
+       if (atomic_read(&dc->disk.closing))
+               return;
+
+       if (atomic_xchg(&dc->disk.detaching, 1))
+               return;
+
+       /*
+        * Block the device from being closed and freed until we're finished
+        * detaching
+        */
+       closure_get(&dc->disk.cl);
+
+       bch_writeback_queue(dc);
+       cached_dev_put(dc);
+}
+
+int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c)
+{
+       uint32_t rtime = cpu_to_le32(get_seconds());
+       struct uuid_entry *u;
+       char buf[BDEVNAME_SIZE];
+
+       bdevname(dc->bdev, buf);
+
+       if (memcmp(dc->sb.set_uuid, c->sb.set_uuid, 16))
+               return -ENOENT;
+
+       if (dc->disk.c) {
+               pr_err("Can't attach %s: already attached", buf);
+               return -EINVAL;
+       }
+
+       if (test_bit(CACHE_SET_STOPPING, &c->flags)) {
+               pr_err("Can't attach %s: shutting down", buf);
+               return -EINVAL;
+       }
+
+       if (dc->sb.block_size < c->sb.block_size) {
+               /* Will die */
+               pr_err("Couldn't attach %s: block size less than set's block size",
+                      buf);
+               return -EINVAL;
+       }
+
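+       /*
+        * Look the device up by UUID in the cache set; if it's found but the
+        * backing device is NONE/STALE, the old entry (and any cached data it
+        * refers to) is invalidated and a fresh slot is allocated below.
+        */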
+       u = uuid_find(c, dc->sb.uuid);
+
+       if (u &&
+           (BDEV_STATE(&dc->sb) == BDEV_STATE_STALE ||
+            BDEV_STATE(&dc->sb) == BDEV_STATE_NONE)) {
+               memcpy(u->uuid, invalid_uuid, 16);
+               u->invalidated = cpu_to_le32(get_seconds());
+               u = NULL;
+       }
+
+       if (!u) {
+               if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) {
+                       pr_err("Couldn't find uuid for %s in set", buf);
+                       return -ENOENT;
+               }
+
+               u = uuid_find_empty(c);
+               if (!u) {
+                       pr_err("Not caching %s, no room for UUID", buf);
+                       return -EINVAL;
+               }
+       }
+
+       /* Deadlocks since we're called via sysfs...
+       sysfs_remove_file(&dc->kobj, &sysfs_attach);
+        */
+
+       if (bch_is_zero(u->uuid, 16)) {
+               struct closure cl;
+               closure_init_stack(&cl);
+
+               memcpy(u->uuid, dc->sb.uuid, 16);
+               memcpy(u->label, dc->sb.label, SB_LABEL_SIZE);
+               u->first_reg = u->last_reg = rtime;
+               bch_uuid_write(c);
+
+               memcpy(dc->sb.set_uuid, c->sb.set_uuid, 16);
+               SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN);
+
+               bch_write_bdev_super(dc, &cl);
+               closure_sync(&cl);
+       } else {
+               u->last_reg = rtime;
+               bch_uuid_write(c);
+       }
+
+       bcache_device_attach(&dc->disk, c, u - c->uuids);
+       list_move(&dc->list, &c->cached_devs);
+       calc_cached_dev_sectors(c);
+
+       smp_wmb();
+       /*
+        * dc->c must be set before dc->count != 0 - paired with the mb in
+        * cached_dev_get()
+        */
+       atomic_set(&dc->count, 1);
+
+       if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) {
+               atomic_set(&dc->has_dirty, 1);
+               atomic_inc(&dc->count);
+               bch_writeback_queue(dc);
+       }
+
+       bch_cached_dev_run(dc);
+       bcache_device_link(&dc->disk, c, "bdev");
+
+       pr_info("Caching %s as %s on set %pU",
+               bdevname(dc->bdev, buf), dc->disk.disk->disk_name,
+               dc->disk.c->sb.set_uuid);
+       return 0;
+}
+
+void bch_cached_dev_release(struct kobject *kobj)
+{
+       struct cached_dev *dc = container_of(kobj, struct cached_dev,
+                                            disk.kobj);
+       kfree(dc);
+       module_put(THIS_MODULE);
+}
+
+static void cached_dev_free(struct closure *cl)
+{
+       struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl);
+
+       cancel_delayed_work_sync(&dc->writeback_rate_update);
+
+       mutex_lock(&bch_register_lock);
+
+       bd_unlink_disk_holder(dc->bdev, dc->disk.disk);
+       bcache_device_free(&dc->disk);
+       list_del(&dc->list);
+
+       mutex_unlock(&bch_register_lock);
+
+       if (!IS_ERR_OR_NULL(dc->bdev)) {
+               blk_sync_queue(bdev_get_queue(dc->bdev));
+               blkdev_put(dc->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
+       }
+
+       wake_up(&unregister_wait);
+
+       kobject_put(&dc->disk.kobj);
+}
+
+static void cached_dev_flush(struct closure *cl)
+{
+       struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl);
+       struct bcache_device *d = &dc->disk;
+
+       bch_cache_accounting_destroy(&dc->accounting);
+       kobject_del(&d->kobj);
+
+       continue_at(cl, cached_dev_free, system_wq);
+}
+
+static int cached_dev_init(struct cached_dev *dc, unsigned block_size)
+{
+       int err;
+       struct io *io;
+
+       closure_init(&dc->disk.cl, NULL);
+       set_closure_fn(&dc->disk.cl, cached_dev_flush, system_wq);
+
+       __module_get(THIS_MODULE);
+       INIT_LIST_HEAD(&dc->list);
+       kobject_init(&dc->disk.kobj, &bch_cached_dev_ktype);
+
+       bch_cache_accounting_init(&dc->accounting, &dc->disk.cl);
+
+       err = bcache_device_init(&dc->disk, block_size);
+       if (err)
+               goto err;
+
+       spin_lock_init(&dc->io_lock);
+       closure_init_unlocked(&dc->sb_write);
+       INIT_WORK(&dc->detach, cached_dev_detach_finish);
+
+       dc->sequential_merge            = true;
+       dc->sequential_cutoff           = 4 << 20;
+
+       INIT_LIST_HEAD(&dc->io_lru);
+       dc->sb_bio.bi_max_vecs  = 1;
+       dc->sb_bio.bi_io_vec    = dc->sb_bio.bi_inline_vecs;
+
+       for (io = dc->io; io < dc->io + RECENT_IO; io++) {
+               list_add(&io->lru, &dc->io_lru);
+               hlist_add_head(&io->hash, dc->io_hash + RECENT_IO);
+       }
+
+       bch_writeback_init_cached_dev(dc);
+       return 0;
+err:
+       bcache_device_stop(&dc->disk);
+       return err;
+}
+
+/* Cached device - bcache superblock */
+
+static const char *register_bdev(struct cache_sb *sb, struct page *sb_page,
+                                struct block_device *bdev,
+                                struct cached_dev *dc)
+{
+       char name[BDEVNAME_SIZE];
+       const char *err = "cannot allocate memory";
+       struct gendisk *g;
+       struct cache_set *c;
+
+       if (!dc || cached_dev_init(dc, sb->block_size << 9) != 0)
+               return err;
+
+       memcpy(&dc->sb, sb, sizeof(struct cache_sb));
+       dc->sb_bio.bi_io_vec[0].bv_page = sb_page;
+       dc->bdev = bdev;
+       dc->bdev->bd_holder = dc;
+
+       g = dc->disk.disk;
+
+       set_capacity(g, dc->bdev->bd_part->nr_sects - dc->sb.data_offset);
+
+       g->queue->backing_dev_info.ra_pages =
+               max(g->queue->backing_dev_info.ra_pages,
+                   bdev->bd_queue->backing_dev_info.ra_pages);
+
+       bch_cached_dev_request_init(dc);
+
+       err = "error creating kobject";
+       if (kobject_add(&dc->disk.kobj, &part_to_dev(bdev->bd_part)->kobj,
+                       "bcache"))
+               goto err;
+       if (bch_cache_accounting_add_kobjs(&dc->accounting, &dc->disk.kobj))
+               goto err;
+
+       list_add(&dc->list, &uncached_devices);
+       list_for_each_entry(c, &bch_cache_sets, list)
+               bch_cached_dev_attach(dc, c);
+
+       if (BDEV_STATE(&dc->sb) == BDEV_STATE_NONE ||
+           BDEV_STATE(&dc->sb) == BDEV_STATE_STALE)
+               bch_cached_dev_run(dc);
+
+       return NULL;
+err:
+       kobject_put(&dc->disk.kobj);
+       pr_notice("error opening %s: %s", bdevname(bdev, name), err);
+       /*
+        * Return NULL instead of an error because kobject_put() cleans
+        * everything up
+        */
+       return NULL;
+}
+
+/* Flash only volumes */
+
+void bch_flash_dev_release(struct kobject *kobj)
+{
+       struct bcache_device *d = container_of(kobj, struct bcache_device,
+                                              kobj);
+       kfree(d);
+}
+
+static void flash_dev_free(struct closure *cl)
+{
+       struct bcache_device *d = container_of(cl, struct bcache_device, cl);
+       bcache_device_free(d);
+       kobject_put(&d->kobj);
+}
+
+static void flash_dev_flush(struct closure *cl)
+{
+       struct bcache_device *d = container_of(cl, struct bcache_device, cl);
+
+       bcache_device_unlink(d);
+       kobject_del(&d->kobj);
+       continue_at(cl, flash_dev_free, system_wq);
+}
+
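+/*
+ * Flash only volumes are bcache devices backed entirely by the cache set:
+ * they have no backing device, and their capacity comes from the uuid
+ * entry's sectors field.
+ */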
+static int flash_dev_run(struct cache_set *c, struct uuid_entry *u)
+{
+       struct bcache_device *d = kzalloc(sizeof(struct bcache_device),
+                                         GFP_KERNEL);
+       if (!d)
+               return -ENOMEM;
+
+       closure_init(&d->cl, NULL);
+       set_closure_fn(&d->cl, flash_dev_flush, system_wq);
+
+       kobject_init(&d->kobj, &bch_flash_dev_ktype);
+
+       if (bcache_device_init(d, block_bytes(c)))
+               goto err;
+
+       bcache_device_attach(d, c, u - c->uuids);
+       set_capacity(d->disk, u->sectors);
+       bch_flash_dev_request_init(d);
+       add_disk(d->disk);
+
+       if (kobject_add(&d->kobj, &disk_to_dev(d->disk)->kobj, "bcache"))
+               goto err;
+
+       bcache_device_link(d, c, "volume");
+
+       return 0;
+err:
+       kobject_put(&d->kobj);
+       return -ENOMEM;
+}
+
+static int flash_devs_run(struct cache_set *c)
+{
+       int ret = 0;
+       struct uuid_entry *u;
+
+       for (u = c->uuids;
+            u < c->uuids + c->nr_uuids && !ret;
+            u++)
+               if (UUID_FLASH_ONLY(u))
+                       ret = flash_dev_run(c, u);
+
+       return ret;
+}
+
+int bch_flash_dev_create(struct cache_set *c, uint64_t size)
+{
+       struct uuid_entry *u;
+
+       if (test_bit(CACHE_SET_STOPPING, &c->flags))
+               return -EINTR;
+
+       u = uuid_find_empty(c);
+       if (!u) {
+               pr_err("Can't create volume, no room for UUID");
+               return -EINVAL;
+       }
+
+       get_random_bytes(u->uuid, 16);
+       memset(u->label, 0, 32);
+       u->first_reg = u->last_reg = cpu_to_le32(get_seconds());
+
+       SET_UUID_FLASH_ONLY(u, 1);
+       u->sectors = size >> 9;
+
+       bch_uuid_write(c);
+
+       return flash_dev_run(c, u);
+}
+
+/* Cache set */
+
+__printf(2, 3)
+bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...)
+{
+       va_list args;
+
+       if (test_bit(CACHE_SET_STOPPING, &c->flags))
+               return false;
+
+       /* XXX: we can be called from atomic context
+       acquire_console_sem();
+       */
+
+       printk(KERN_ERR "bcache: error on %pU: ", c->sb.set_uuid);
+
+       va_start(args, fmt);
+       vprintk(fmt, args);
+       va_end(args);
+
+       printk(", disabling caching\n");
+
+       bch_cache_set_unregister(c);
+       return true;
+}
+
+void bch_cache_set_release(struct kobject *kobj)
+{
+       struct cache_set *c = container_of(kobj, struct cache_set, kobj);
+       kfree(c);
+       module_put(THIS_MODULE);
+}
+
+static void cache_set_free(struct closure *cl)
+{
+       struct cache_set *c = container_of(cl, struct cache_set, cl);
+       struct cache *ca;
+       unsigned i;
+
+       if (!IS_ERR_OR_NULL(c->debug))
+               debugfs_remove(c->debug);
+
+       bch_open_buckets_free(c);
+       bch_btree_cache_free(c);
+       bch_journal_free(c);
+
+       for_each_cache(ca, c, i)
+               if (ca)
+                       kobject_put(&ca->kobj);
+
+       free_pages((unsigned long) c->uuids, ilog2(bucket_pages(c)));
+       free_pages((unsigned long) c->sort, ilog2(bucket_pages(c)));
+
+       kfree(c->fill_iter);
+       if (c->bio_split)
+               bioset_free(c->bio_split);
+       if (c->bio_meta)
+               mempool_destroy(c->bio_meta);
+       if (c->search)
+               mempool_destroy(c->search);
+       kfree(c->devices);
+
+       mutex_lock(&bch_register_lock);
+       list_del(&c->list);
+       mutex_unlock(&bch_register_lock);
+
+       pr_info("Cache set %pU unregistered", c->sb.set_uuid);
+       wake_up(&unregister_wait);
+
+       closure_debug_destroy(&c->cl);
+       kobject_put(&c->kobj);
+}
+
+static void cache_set_flush(struct closure *cl)
+{
+       struct cache_set *c = container_of(cl, struct cache_set, caching);
+       struct btree *b;
+
+       /* Shut down allocator threads */
+       set_bit(CACHE_SET_STOPPING_2, &c->flags);
+       wake_up(&c->alloc_wait);
+
+       bch_cache_accounting_destroy(&c->accounting);
+
+       kobject_put(&c->internal);
+       kobject_del(&c->kobj);
+
+       if (!IS_ERR_OR_NULL(c->root))
+               list_add(&c->root->list, &c->btree_cache);
+
+       /* Should skip this if we're unregistering because of an error */
+       list_for_each_entry(b, &c->btree_cache, list)
+               if (btree_node_dirty(b))
+                       bch_btree_write(b, true, NULL);
+
+       closure_return(cl);
+}
+
+static void __cache_set_unregister(struct closure *cl)
+{
+       struct cache_set *c = container_of(cl, struct cache_set, caching);
+       struct cached_dev *dc, *t;
+       size_t i;
+
+       mutex_lock(&bch_register_lock);
+
+       if (test_bit(CACHE_SET_UNREGISTERING, &c->flags))
+               list_for_each_entry_safe(dc, t, &c->cached_devs, list)
+                       bch_cached_dev_detach(dc);
+
+       for (i = 0; i < c->nr_uuids; i++)
+               if (c->devices[i] && UUID_FLASH_ONLY(&c->uuids[i]))
+                       bcache_device_stop(c->devices[i]);
+
+       mutex_unlock(&bch_register_lock);
+
+       continue_at(cl, cache_set_flush, system_wq);
+}
+
+void bch_cache_set_stop(struct cache_set *c)
+{
+       if (!test_and_set_bit(CACHE_SET_STOPPING, &c->flags))
+               closure_queue(&c->caching);
+}
+
+void bch_cache_set_unregister(struct cache_set *c)
+{
+       set_bit(CACHE_SET_UNREGISTERING, &c->flags);
+       bch_cache_set_stop(c);
+}
+
+#define alloc_bucket_pages(gfp, c)                     \
+       ((void *) __get_free_pages(__GFP_ZERO|gfp, ilog2(bucket_pages(c))))
+
+struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
+{
+       int iter_size;
+       struct cache_set *c = kzalloc(sizeof(struct cache_set), GFP_KERNEL);
+       if (!c)
+               return NULL;
+
+       __module_get(THIS_MODULE);
+       closure_init(&c->cl, NULL);
+       set_closure_fn(&c->cl, cache_set_free, system_wq);
+
+       closure_init(&c->caching, &c->cl);
+       set_closure_fn(&c->caching, __cache_set_unregister, system_wq);
+
+       /* Maybe create continue_at_noreturn() and use it here? */
+       closure_set_stopped(&c->cl);
+       closure_put(&c->cl);
+
+       kobject_init(&c->kobj, &bch_cache_set_ktype);
+       kobject_init(&c->internal, &bch_cache_set_internal_ktype);
+
+       bch_cache_accounting_init(&c->accounting, &c->cl);
+
+       memcpy(c->sb.set_uuid, sb->set_uuid, 16);
+       c->sb.block_size        = sb->block_size;
+       c->sb.bucket_size       = sb->bucket_size;
+       c->sb.nr_in_set         = sb->nr_in_set;
+       c->sb.last_mount        = sb->last_mount;
+       c->bucket_bits          = ilog2(sb->bucket_size);
+       c->block_bits           = ilog2(sb->block_size);
+       c->nr_uuids             = bucket_bytes(c) / sizeof(struct uuid_entry);
+
+       c->btree_pages          = c->sb.bucket_size / PAGE_SECTORS;
+       if (c->btree_pages > BTREE_MAX_PAGES)
+               c->btree_pages = max_t(int, c->btree_pages / 4,
+                                      BTREE_MAX_PAGES);
+
+       init_waitqueue_head(&c->alloc_wait);
+       mutex_init(&c->bucket_lock);
+       mutex_init(&c->fill_lock);
+       mutex_init(&c->sort_lock);
+       spin_lock_init(&c->sort_time_lock);
+       closure_init_unlocked(&c->sb_write);
+       closure_init_unlocked(&c->uuid_write);
+       spin_lock_init(&c->btree_read_time_lock);
+       bch_moving_init_cache_set(c);
+
+       INIT_LIST_HEAD(&c->list);
+       INIT_LIST_HEAD(&c->cached_devs);
+       INIT_LIST_HEAD(&c->btree_cache);
+       INIT_LIST_HEAD(&c->btree_cache_freeable);
+       INIT_LIST_HEAD(&c->btree_cache_freed);
+       INIT_LIST_HEAD(&c->data_buckets);
+
+       c->search = mempool_create_slab_pool(32, bch_search_cache);
+       if (!c->search)
+               goto err;
+
+       iter_size = (sb->bucket_size / sb->block_size + 1) *
+               sizeof(struct btree_iter_set);
+
+       if (!(c->devices = kzalloc(c->nr_uuids * sizeof(void *), GFP_KERNEL)) ||
+           !(c->bio_meta = mempool_create_kmalloc_pool(2,
+                               sizeof(struct bbio) + sizeof(struct bio_vec) *
+                               bucket_pages(c))) ||
+           !(c->bio_split = bioset_create(4, offsetof(struct bbio, bio))) ||
+           !(c->fill_iter = kmalloc(iter_size, GFP_KERNEL)) ||
+           !(c->sort = alloc_bucket_pages(GFP_KERNEL, c)) ||
+           !(c->uuids = alloc_bucket_pages(GFP_KERNEL, c)) ||
+           bch_journal_alloc(c) ||
+           bch_btree_cache_alloc(c) ||
+           bch_open_buckets_alloc(c))
+               goto err;
+
+       c->fill_iter->size = sb->bucket_size / sb->block_size;
+
+       c->congested_read_threshold_us  = 2000;
+       c->congested_write_threshold_us = 20000;
+       c->error_limit  = 8 << IO_ERROR_SHIFT;
+
+       return c;
+err:
+       bch_cache_set_unregister(c);
+       return NULL;
+}
+
+static void run_cache_set(struct cache_set *c)
+{
+       const char *err = "cannot allocate memory";
+       struct cached_dev *dc, *t;
+       struct cache *ca;
+       unsigned i;
+
+       struct btree_op op;
+       bch_btree_op_init_stack(&op);
+       op.lock = SHRT_MAX;
+
+       for_each_cache(ca, c, i)
+               c->nbuckets += ca->sb.nbuckets;
+
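+       /*
+        * If the cache was running in synchronous mode it has a valid journal
+        * and btree on disk, so recover from them; otherwise treat the cache
+        * as empty and write out fresh metadata.
+        */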
+       if (CACHE_SYNC(&c->sb)) {
+               LIST_HEAD(journal);
+               struct bkey *k;
+               struct jset *j;
+
+               err = "cannot allocate memory for journal";
+               if (bch_journal_read(c, &journal, &op))
+                       goto err;
+
+               pr_debug("bch_journal_read() done");
+
+               err = "no journal entries found";
+               if (list_empty(&journal))
+                       goto err;
+
+               j = &list_entry(journal.prev, struct journal_replay, list)->j;
+
+               err = "IO error reading priorities";
+               for_each_cache(ca, c, i)
+                       prio_read(ca, j->prio_bucket[ca->sb.nr_this_dev]);
+
+               /*
+                * If prio_read() fails it'll call cache_set_error and we'll
+                * tear everything down right away, but if we checked for the
+                * error sooner we could avoid the journal replay.
+                */
+
+               k = &j->btree_root;
+
+               err = "bad btree root";
+               if (__bch_ptr_invalid(c, j->btree_level + 1, k))
+                       goto err;
+
+               err = "error reading btree root";
+               c->root = bch_btree_node_get(c, k, j->btree_level, &op);
+               if (IS_ERR_OR_NULL(c->root))
+                       goto err;
+
+               list_del_init(&c->root->list);
+               rw_unlock(true, c->root);
+
+               err = uuid_read(c, j, &op.cl);
+               if (err)
+                       goto err;
+
+               err = "error in recovery";
+               if (bch_btree_check(c, &op))
+                       goto err;
+
+               bch_journal_mark(c, &journal);
+               bch_btree_gc_finish(c);
+               pr_debug("btree_check() done");
+
+               /*
+                * bcache_journal_next() can't happen sooner, or
+                * btree_gc_finish() will give spurious errors about last_gc >
+                * gc_gen - this is a hack but oh well.
+                */
+               bch_journal_next(&c->journal);
+
+               for_each_cache(ca, c, i)
+                       closure_call(&ca->alloc, bch_allocator_thread,
+                                    system_wq, &c->cl);
+
+               /*
+                * First place it's safe to allocate: btree_check() and
+                * btree_gc_finish() have to run before we have buckets to
+                * allocate, and bch_bucket_alloc_set() might cause a journal
+                * entry to be written so bcache_journal_next() has to be called
+                * first.
+                *
+                * If the uuids were in the old format we have to rewrite them
+                * before the next journal entry is written:
+                */
+               if (j->version < BCACHE_JSET_VERSION_UUID)
+                       __uuid_write(c);
+
+               bch_journal_replay(c, &journal, &op);
+       } else {
+               pr_notice("invalidating existing data");
+               /* Don't want invalidate_buckets() to queue a gc yet */
+               closure_lock(&c->gc, NULL);
+
+               for_each_cache(ca, c, i) {
+                       unsigned j;
+
+                       ca->sb.keys = clamp_t(int, ca->sb.nbuckets >> 7,
+                                             2, SB_JOURNAL_BUCKETS);
+
+                       for (j = 0; j < ca->sb.keys; j++)
+                               ca->sb.d[j] = ca->sb.first_bucket + j;
+               }
+
+               bch_btree_gc_finish(c);
+
+               for_each_cache(ca, c, i)
+                       closure_call(&ca->alloc, bch_allocator_thread,
+                                    ca->alloc_workqueue, &c->cl);
+
+               mutex_lock(&c->bucket_lock);
+               for_each_cache(ca, c, i)
+                       bch_prio_write(ca);
+               mutex_unlock(&c->bucket_lock);
+
+               wake_up(&c->alloc_wait);
+
+               err = "cannot allocate new UUID bucket";
+               if (__uuid_write(c))
+                       goto err_unlock_gc;
+
+               err = "cannot allocate new btree root";
+               c->root = bch_btree_node_alloc(c, 0, &op.cl);
+               if (IS_ERR_OR_NULL(c->root))
+                       goto err_unlock_gc;
+
+               bkey_copy_key(&c->root->key, &MAX_KEY);
+               bch_btree_write(c->root, true, &op);
+
+               bch_btree_set_root(c->root);
+               rw_unlock(true, c->root);
+
+               /*
+                * We don't want to write the first journal entry until
+                * everything is set up - fortunately journal entries won't be
+                * written until the SET_CACHE_SYNC() here:
+                */
+               SET_CACHE_SYNC(&c->sb, true);
+
+               bch_journal_next(&c->journal);
+               bch_journal_meta(c, &op.cl);
+
+               /* Unlock */
+               closure_set_stopped(&c->gc.cl);
+               closure_put(&c->gc.cl);
+       }
+
+       closure_sync(&op.cl);
+       c->sb.last_mount = get_seconds();
+       bcache_write_super(c);
+
+       list_for_each_entry_safe(dc, t, &uncached_devices, list)
+               bch_cached_dev_attach(dc, c);
+
+       flash_devs_run(c);
+
+       return;
+err_unlock_gc:
+       closure_set_stopped(&c->gc.cl);
+       closure_put(&c->gc.cl);
+err:
+       closure_sync(&op.cl);
+       /* XXX: test this, it's broken */
+       bch_cache_set_error(c, err);
+}
+
+static bool can_attach_cache(struct cache *ca, struct cache_set *c)
+{
+       return ca->sb.block_size        == c->sb.block_size &&
+               ca->sb.bucket_size      == c->sb.bucket_size &&
+               ca->sb.nr_in_set        == c->sb.nr_in_set;
+}
+
+static const char *register_cache_set(struct cache *ca)
+{
+       char buf[12];
+       const char *err = "cannot allocate memory";
+       struct cache_set *c;
+
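+       /*
+        * A cache may be one member of a multi device set: look for an
+        * existing set with the same set_uuid first, otherwise allocate a
+        * new one. run_cache_set() only runs once every member has shown up.
+        */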
+       list_for_each_entry(c, &bch_cache_sets, list)
+               if (!memcmp(c->sb.set_uuid, ca->sb.set_uuid, 16)) {
+                       if (c->cache[ca->sb.nr_this_dev])
+                               return "duplicate cache set member";
+
+                       if (!can_attach_cache(ca, c))
+                               return "cache sb does not match set";
+
+                       if (!CACHE_SYNC(&ca->sb))
+                               SET_CACHE_SYNC(&c->sb, false);
+
+                       goto found;
+               }
+
+       c = bch_cache_set_alloc(&ca->sb);
+       if (!c)
+               return err;
+
+       err = "error creating kobject";
+       if (kobject_add(&c->kobj, bcache_kobj, "%pU", c->sb.set_uuid) ||
+           kobject_add(&c->internal, &c->kobj, "internal"))
+               goto err;
+
+       if (bch_cache_accounting_add_kobjs(&c->accounting, &c->kobj))
+               goto err;
+
+       bch_debug_init_cache_set(c);
+
+       list_add(&c->list, &bch_cache_sets);
+found:
+       sprintf(buf, "cache%i", ca->sb.nr_this_dev);
+       if (sysfs_create_link(&ca->kobj, &c->kobj, "set") ||
+           sysfs_create_link(&c->kobj, &ca->kobj, buf))
+               goto err;
+
+       if (ca->sb.seq > c->sb.seq) {
+               c->sb.version           = ca->sb.version;
+               memcpy(c->sb.set_uuid, ca->sb.set_uuid, 16);
+               c->sb.flags             = ca->sb.flags;
+               c->sb.seq               = ca->sb.seq;
+               pr_debug("set version = %llu", c->sb.version);
+       }
+
+       ca->set = c;
+       ca->set->cache[ca->sb.nr_this_dev] = ca;
+       c->cache_by_alloc[c->caches_loaded++] = ca;
+
+       if (c->caches_loaded == c->sb.nr_in_set)
+               run_cache_set(c);
+
+       return NULL;
+err:
+       bch_cache_set_unregister(c);
+       return err;
+}
+
+/* Cache device */
+
+void bch_cache_release(struct kobject *kobj)
+{
+       struct cache *ca = container_of(kobj, struct cache, kobj);
+
+       if (ca->set)
+               ca->set->cache[ca->sb.nr_this_dev] = NULL;
+
+       bch_cache_allocator_exit(ca);
+
+       bio_split_pool_free(&ca->bio_split_hook);
+
+       if (ca->alloc_workqueue)
+               destroy_workqueue(ca->alloc_workqueue);
+
+       free_pages((unsigned long) ca->disk_buckets, ilog2(bucket_pages(ca)));
+       kfree(ca->prio_buckets);
+       vfree(ca->buckets);
+
+       free_heap(&ca->heap);
+       free_fifo(&ca->unused);
+       free_fifo(&ca->free_inc);
+       free_fifo(&ca->free);
+
+       if (ca->sb_bio.bi_inline_vecs[0].bv_page)
+               put_page(ca->sb_bio.bi_io_vec[0].bv_page);
+
+       if (!IS_ERR_OR_NULL(ca->bdev)) {
+               blk_sync_queue(bdev_get_queue(ca->bdev));
+               blkdev_put(ca->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
+       }
+
+       kfree(ca);
+       module_put(THIS_MODULE);
+}
+
+static int cache_alloc(struct cache_sb *sb, struct cache *ca)
+{
+       size_t free;
+       struct bucket *b;
+
+       if (!ca)
+               return -ENOMEM;
+
+       __module_get(THIS_MODULE);
+       kobject_init(&ca->kobj, &bch_cache_ktype);
+
+       memcpy(&ca->sb, sb, sizeof(struct cache_sb));
+
+       INIT_LIST_HEAD(&ca->discards);
+
+       bio_init(&ca->sb_bio);
+       ca->sb_bio.bi_max_vecs  = 1;
+       ca->sb_bio.bi_io_vec    = ca->sb_bio.bi_inline_vecs;
+
+       bio_init(&ca->journal.bio);
+       ca->journal.bio.bi_max_vecs = 8;
+       ca->journal.bio.bi_io_vec = ca->journal.bio.bi_inline_vecs;
+
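+       /*
+        * Size the free/unused fifos and the heap relative to the number of
+        * buckets, but never smaller than what the priority buckets need.
+        */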
+       free = roundup_pow_of_two(ca->sb.nbuckets) >> 9;
+       free = max_t(size_t, free, (prio_buckets(ca) + 8) * 2);
+
+       if (!init_fifo(&ca->free,       free, GFP_KERNEL) ||
+           !init_fifo(&ca->free_inc,   free << 2, GFP_KERNEL) ||
+           !init_fifo(&ca->unused,     free << 2, GFP_KERNEL) ||
+           !init_heap(&ca->heap,       free << 3, GFP_KERNEL) ||
+           !(ca->buckets       = vmalloc(sizeof(struct bucket) *
+                                         ca->sb.nbuckets)) ||
+           !(ca->prio_buckets  = kzalloc(sizeof(uint64_t) * prio_buckets(ca) *
+                                         2, GFP_KERNEL)) ||
+           !(ca->disk_buckets  = alloc_bucket_pages(GFP_KERNEL, ca)) ||
+           !(ca->alloc_workqueue = alloc_workqueue("bch_allocator", 0, 1)) ||
+           bio_split_pool_init(&ca->bio_split_hook))
+               goto err;
+
+       ca->prio_last_buckets = ca->prio_buckets + prio_buckets(ca);
+
+       memset(ca->buckets, 0, ca->sb.nbuckets * sizeof(struct bucket));
+       for_each_bucket(b, ca)
+               atomic_set(&b->pin, 0);
+
+       if (bch_cache_allocator_init(ca))
+               goto err;
+
+       return 0;
+err:
+       kobject_put(&ca->kobj);
+       return -ENOMEM;
+}
+
+static const char *register_cache(struct cache_sb *sb, struct page *sb_page,
+                                 struct block_device *bdev, struct cache *ca)
+{
+       char name[BDEVNAME_SIZE];
+       const char *err = "cannot allocate memory";
+
+       if (cache_alloc(sb, ca) != 0)
+               return err;
+
+       ca->sb_bio.bi_io_vec[0].bv_page = sb_page;
+       ca->bdev = bdev;
+       ca->bdev->bd_holder = ca;
+
+       if (blk_queue_discard(bdev_get_queue(ca->bdev)))
+               ca->discard = CACHE_DISCARD(&ca->sb);
+
+       err = "error creating kobject";
+       if (kobject_add(&ca->kobj, &part_to_dev(bdev->bd_part)->kobj, "bcache"))
+               goto err;
+
+       err = register_cache_set(ca);
+       if (err)
+               goto err;
+
+       pr_info("registered cache device %s", bdevname(bdev, name));
+
+       return NULL;
+err:
+       kobject_put(&ca->kobj);
+       pr_info("error opening %s: %s", bdevname(bdev, name), err);
+       /* Return NULL instead of an error because kobject_put() cleans
+        * everything up
+        */
+       return NULL;
+}
+
+/* Global interfaces/init */
+
+static ssize_t register_bcache(struct kobject *, struct kobj_attribute *,
+                              const char *, size_t);
+
+kobj_attribute_write(register,         register_bcache);
+kobj_attribute_write(register_quiet,   register_bcache);
+
+static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
+                              const char *buffer, size_t size)
+{
+       ssize_t ret = size;
+       const char *err = "cannot allocate memory";
+       char *path = NULL;
+       struct cache_sb *sb = NULL;
+       struct block_device *bdev = NULL;
+       struct page *sb_page = NULL;
+
+       if (!try_module_get(THIS_MODULE))
+               return -EBUSY;
+
+       mutex_lock(&bch_register_lock);
+
+       if (!(path = kstrndup(buffer, size, GFP_KERNEL)) ||
+           !(sb = kmalloc(sizeof(struct cache_sb), GFP_KERNEL)))
+               goto err;
+
+       err = "failed to open device";
+       bdev = blkdev_get_by_path(strim(path),
+                                 FMODE_READ|FMODE_WRITE|FMODE_EXCL,
+                                 sb);
+       if (bdev == ERR_PTR(-EBUSY))
+               err = "device busy";
+
+       if (IS_ERR(bdev) ||
+           set_blocksize(bdev, 4096))
+               goto err;
+
+       err = read_super(sb, bdev, &sb_page);
+       if (err)
+               goto err_close;
+
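+       /*
+        * The superblock tells us whether this is a backing device or a
+        * cache device; register it accordingly.
+        */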
+       if (SB_IS_BDEV(sb)) {
+               struct cached_dev *dc = kzalloc(sizeof(*dc), GFP_KERNEL);
+
+               err = register_bdev(sb, sb_page, bdev, dc);
+       } else {
+               struct cache *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
+
+               err = register_cache(sb, sb_page, bdev, ca);
+       }
+
+       if (err) {
+               /* register_(bdev|cache) will only return an error if they
+                * didn't get far enough to create the kobject - if they did,
+                * the kobject destructor will do this cleanup.
+                */
+               put_page(sb_page);
+err_close:
+               blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
+err:
+               if (attr != &ksysfs_register_quiet)
+                       pr_info("error opening %s: %s", path, err);
+               ret = -EINVAL;
+       }
+
+       kfree(sb);
+       kfree(path);
+       mutex_unlock(&bch_register_lock);
+       module_put(THIS_MODULE);
+       return ret;
+}
+
+static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x)
+{
+       if (code == SYS_DOWN ||
+           code == SYS_HALT ||
+           code == SYS_POWER_OFF) {
+               DEFINE_WAIT(wait);
+               unsigned long start = jiffies;
+               bool stopped = false;
+
+               struct cache_set *c, *tc;
+               struct cached_dev *dc, *tdc;
+
+               mutex_lock(&bch_register_lock);
+
+               if (list_empty(&bch_cache_sets) &&
+                   list_empty(&uncached_devices))
+                       goto out;
+
+               pr_info("Stopping all devices:");
+
+               list_for_each_entry_safe(c, tc, &bch_cache_sets, list)
+                       bch_cache_set_stop(c);
+
+               list_for_each_entry_safe(dc, tdc, &uncached_devices, list)
+                       bcache_device_stop(&dc->disk);
+
+               /* What's a condition variable? */
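+               /*
+                * Hand rolled wait: give the devices up to two seconds to
+                * finish stopping, rechecking whenever unregister_wait is
+                * woken.
+                */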
+               while (1) {
+                       long timeout = start + 2 * HZ - jiffies;
+
+                       stopped = list_empty(&bch_cache_sets) &&
+                               list_empty(&uncached_devices);
+
+                       if (timeout < 0 || stopped)
+                               break;
+
+                       prepare_to_wait(&unregister_wait, &wait,
+                                       TASK_UNINTERRUPTIBLE);
+
+                       mutex_unlock(&bch_register_lock);
+                       schedule_timeout(timeout);
+                       mutex_lock(&bch_register_lock);
+               }
+
+               finish_wait(&unregister_wait, &wait);
+
+               if (stopped)
+                       pr_info("All devices stopped");
+               else
+                       pr_notice("Timeout waiting for devices to be closed");
+out:
+               mutex_unlock(&bch_register_lock);
+       }
+
+       return NOTIFY_DONE;
+}
+
+static struct notifier_block reboot = {
+       .notifier_call  = bcache_reboot,
+       .priority       = INT_MAX, /* before any real devices */
+};
+
+static void bcache_exit(void)
+{
+       bch_debug_exit();
+       bch_writeback_exit();
+       bch_request_exit();
+       bch_btree_exit();
+       if (bcache_kobj)
+               kobject_put(bcache_kobj);
+       if (bcache_wq)
+               destroy_workqueue(bcache_wq);
+       unregister_blkdev(bcache_major, "bcache");
+       unregister_reboot_notifier(&reboot);
+}
+
+static int __init bcache_init(void)
+{
+       static const struct attribute *files[] = {
+               &ksysfs_register.attr,
+               &ksysfs_register_quiet.attr,
+               NULL
+       };
+
+       mutex_init(&bch_register_lock);
+       init_waitqueue_head(&unregister_wait);
+       register_reboot_notifier(&reboot);
+       closure_debug_init();
+
+       bcache_major = register_blkdev(0, "bcache");
+       if (bcache_major < 0)
+               return bcache_major;
+
+       if (!(bcache_wq = create_workqueue("bcache")) ||
+           !(bcache_kobj = kobject_create_and_add("bcache", fs_kobj)) ||
+           sysfs_create_files(bcache_kobj, files) ||
+           bch_btree_init() ||
+           bch_request_init() ||
+           bch_writeback_init() ||
+           bch_debug_init(bcache_kobj))
+               goto err;
+
+       return 0;
+err:
+       bcache_exit();
+       return -ENOMEM;
+}
+
+module_exit(bcache_exit);
+module_init(bcache_init);
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
new file mode 100644 (file)
index 0000000..4d9cca4
--- /dev/null
@@ -0,0 +1,817 @@
+/*
+ * bcache sysfs interfaces
+ *
+ * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
+ * Copyright 2012 Google, Inc.
+ */
+
+#include "bcache.h"
+#include "sysfs.h"
+#include "btree.h"
+#include "request.h"
+
+#include <linux/sort.h>
+
+static const char * const cache_replacement_policies[] = {
+       "lru",
+       "fifo",
+       "random",
+       NULL
+};
+
+write_attribute(attach);
+write_attribute(detach);
+write_attribute(unregister);
+write_attribute(stop);
+write_attribute(clear_stats);
+write_attribute(trigger_gc);
+write_attribute(prune_cache);
+write_attribute(flash_vol_create);
+
+read_attribute(bucket_size);
+read_attribute(block_size);
+read_attribute(nbuckets);
+read_attribute(tree_depth);
+read_attribute(root_usage_percent);
+read_attribute(priority_stats);
+read_attribute(btree_cache_size);
+read_attribute(btree_cache_max_chain);
+read_attribute(cache_available_percent);
+read_attribute(written);
+read_attribute(btree_written);
+read_attribute(metadata_written);
+read_attribute(active_journal_entries);
+
+sysfs_time_stats_attribute(btree_gc,   sec, ms);
+sysfs_time_stats_attribute(btree_split, sec, us);
+sysfs_time_stats_attribute(btree_sort, ms,  us);
+sysfs_time_stats_attribute(btree_read, ms,  us);
+sysfs_time_stats_attribute(try_harder, ms,  us);
+
+read_attribute(btree_nodes);
+read_attribute(btree_used_percent);
+read_attribute(average_key_size);
+read_attribute(dirty_data);
+read_attribute(bset_tree_stats);
+
+read_attribute(state);
+read_attribute(cache_read_races);
+read_attribute(writeback_keys_done);
+read_attribute(writeback_keys_failed);
+read_attribute(io_errors);
+read_attribute(congested);
+rw_attribute(congested_read_threshold_us);
+rw_attribute(congested_write_threshold_us);
+
+rw_attribute(sequential_cutoff);
+rw_attribute(sequential_merge);
+rw_attribute(data_csum);
+rw_attribute(cache_mode);
+rw_attribute(writeback_metadata);
+rw_attribute(writeback_running);
+rw_attribute(writeback_percent);
+rw_attribute(writeback_delay);
+rw_attribute(writeback_rate);
+
+rw_attribute(writeback_rate_update_seconds);
+rw_attribute(writeback_rate_d_term);
+rw_attribute(writeback_rate_p_term_inverse);
+rw_attribute(writeback_rate_d_smooth);
+read_attribute(writeback_rate_debug);
+
+rw_attribute(synchronous);
+rw_attribute(journal_delay_ms);
+rw_attribute(discard);
+rw_attribute(running);
+rw_attribute(label);
+rw_attribute(readahead);
+rw_attribute(io_error_limit);
+rw_attribute(io_error_halflife);
+rw_attribute(verify);
+rw_attribute(key_merging_disabled);
+rw_attribute(gc_always_rewrite);
+rw_attribute(freelist_percent);
+rw_attribute(cache_replacement_policy);
+rw_attribute(btree_shrinker_disabled);
+rw_attribute(copy_gc_enabled);
+rw_attribute(size);
+
+SHOW(__bch_cached_dev)
+{
+       struct cached_dev *dc = container_of(kobj, struct cached_dev,
+                                            disk.kobj);
+       const char *states[] = { "no cache", "clean", "dirty", "inconsistent" };
+
+#define var(stat)              (dc->stat)
+
+       if (attr == &sysfs_cache_mode)
+               return bch_snprint_string_list(buf, PAGE_SIZE,
+                                              bch_cache_modes + 1,
+                                              BDEV_CACHE_MODE(&dc->sb));
+
+       sysfs_printf(data_csum,         "%i", dc->disk.data_csum);
+       var_printf(verify,              "%i");
+       var_printf(writeback_metadata,  "%i");
+       var_printf(writeback_running,   "%i");
+       var_print(writeback_delay);
+       var_print(writeback_percent);
+       sysfs_print(writeback_rate,     dc->writeback_rate.rate);
+
+       var_print(writeback_rate_update_seconds);
+       var_print(writeback_rate_d_term);
+       var_print(writeback_rate_p_term_inverse);
+       var_print(writeback_rate_d_smooth);
+
+       if (attr == &sysfs_writeback_rate_debug) {
+               char dirty[20];
+               char derivative[20];
+               char target[20];
+               bch_hprint(dirty,
+                      atomic_long_read(&dc->disk.sectors_dirty) << 9);
+               bch_hprint(derivative,  dc->writeback_rate_derivative << 9);
+               bch_hprint(target,      dc->writeback_rate_target << 9);
+
+               return sprintf(buf,
+                              "rate:\t\t%u\n"
+                              "change:\t\t%i\n"
+                              "dirty:\t\t%s\n"
+                              "derivative:\t%s\n"
+                              "target:\t\t%s\n",
+                              dc->writeback_rate.rate,
+                              dc->writeback_rate_change,
+                              dirty, derivative, target);
+       }
+
+       sysfs_hprint(dirty_data,
+                    atomic_long_read(&dc->disk.sectors_dirty) << 9);
+
+       var_printf(sequential_merge,    "%i");
+       var_hprint(sequential_cutoff);
+       var_hprint(readahead);
+
+       sysfs_print(running,            atomic_read(&dc->running));
+       sysfs_print(state,              states[BDEV_STATE(&dc->sb)]);
+
+       if (attr == &sysfs_label) {
+               memcpy(buf, dc->sb.label, SB_LABEL_SIZE);
+               buf[SB_LABEL_SIZE] = '\0';
+               strcat(buf, "\n");
+               return strlen(buf);
+       }
+
+#undef var
+       return 0;
+}
+SHOW_LOCKED(bch_cached_dev)
+
+STORE(__cached_dev)
+{
+       struct cached_dev *dc = container_of(kobj, struct cached_dev,
+                                            disk.kobj);
+       unsigned v = size;
+       struct cache_set *c;
+
+#define d_strtoul(var)         sysfs_strtoul(var, dc->var)
+#define d_strtoi_h(var)                sysfs_hatoi(var, dc->var)
+
+       sysfs_strtoul(data_csum,        dc->disk.data_csum);
+       d_strtoul(verify);
+       d_strtoul(writeback_metadata);
+       d_strtoul(writeback_running);
+       d_strtoul(writeback_delay);
+       sysfs_strtoul_clamp(writeback_rate,
+                           dc->writeback_rate.rate, 1, 1000000);
+       sysfs_strtoul_clamp(writeback_percent, dc->writeback_percent, 0, 40);
+
+       d_strtoul(writeback_rate_update_seconds);
+       d_strtoul(writeback_rate_d_term);
+       d_strtoul(writeback_rate_p_term_inverse);
+       sysfs_strtoul_clamp(writeback_rate_p_term_inverse,
+                           dc->writeback_rate_p_term_inverse, 1, INT_MAX);
+       d_strtoul(writeback_rate_d_smooth);
+
+       d_strtoul(sequential_merge);
+       d_strtoi_h(sequential_cutoff);
+       d_strtoi_h(readahead);
+
+       if (attr == &sysfs_clear_stats)
+               bch_cache_accounting_clear(&dc->accounting);
+
+       if (attr == &sysfs_running &&
+           strtoul_or_return(buf))
+               bch_cached_dev_run(dc);
+
+       if (attr == &sysfs_cache_mode) {
+               ssize_t v = bch_read_string_list(buf, bch_cache_modes + 1);
+
+               if (v < 0)
+                       return v;
+
+               if ((unsigned) v != BDEV_CACHE_MODE(&dc->sb)) {
+                       SET_BDEV_CACHE_MODE(&dc->sb, v);
+                       bch_write_bdev_super(dc, NULL);
+               }
+       }
+
+       if (attr == &sysfs_label) {
+               memcpy(dc->sb.label, buf, SB_LABEL_SIZE);
+               bch_write_bdev_super(dc, NULL);
+               if (dc->disk.c) {
+                       memcpy(dc->disk.c->uuids[dc->disk.id].label,
+                              buf, SB_LABEL_SIZE);
+                       bch_uuid_write(dc->disk.c);
+               }
+       }
+
+       if (attr == &sysfs_attach) {
+               if (bch_parse_uuid(buf, dc->sb.set_uuid) < 16)
+                       return -EINVAL;
+
+               list_for_each_entry(c, &bch_cache_sets, list) {
+                       v = bch_cached_dev_attach(dc, c);
+                       if (!v)
+                               return size;
+               }
+
+               pr_err("Can't attach %s: cache set not found", buf);
+               size = v;
+       }
+
+       if (attr == &sysfs_detach && dc->disk.c)
+               bch_cached_dev_detach(dc);
+
+       if (attr == &sysfs_stop)
+               bcache_device_stop(&dc->disk);
+
+       return size;
+}
+
+STORE(bch_cached_dev)
+{
+       struct cached_dev *dc = container_of(kobj, struct cached_dev,
+                                            disk.kobj);
+
+       mutex_lock(&bch_register_lock);
+       size = __cached_dev_store(kobj, attr, buf, size);
+
+       if (attr == &sysfs_writeback_running)
+               bch_writeback_queue(dc);
+
+       if (attr == &sysfs_writeback_percent)
+               schedule_delayed_work(&dc->writeback_rate_update,
+                                     dc->writeback_rate_update_seconds * HZ);
+
+       mutex_unlock(&bch_register_lock);
+       return size;
+}
+
+static struct attribute *bch_cached_dev_files[] = {
+       &sysfs_attach,
+       &sysfs_detach,
+       &sysfs_stop,
+#if 0
+       &sysfs_data_csum,
+#endif
+       &sysfs_cache_mode,
+       &sysfs_writeback_metadata,
+       &sysfs_writeback_running,
+       &sysfs_writeback_delay,
+       &sysfs_writeback_percent,
+       &sysfs_writeback_rate,
+       &sysfs_writeback_rate_update_seconds,
+       &sysfs_writeback_rate_d_term,
+       &sysfs_writeback_rate_p_term_inverse,
+       &sysfs_writeback_rate_d_smooth,
+       &sysfs_writeback_rate_debug,
+       &sysfs_dirty_data,
+       &sysfs_sequential_cutoff,
+       &sysfs_sequential_merge,
+       &sysfs_clear_stats,
+       &sysfs_running,
+       &sysfs_state,
+       &sysfs_label,
+       &sysfs_readahead,
+#ifdef CONFIG_BCACHE_DEBUG
+       &sysfs_verify,
+#endif
+       NULL
+};
+KTYPE(bch_cached_dev);
+
+SHOW(bch_flash_dev)
+{
+       struct bcache_device *d = container_of(kobj, struct bcache_device,
+                                              kobj);
+       struct uuid_entry *u = &d->c->uuids[d->id];
+
+       sysfs_printf(data_csum, "%i", d->data_csum);
+       sysfs_hprint(size,      u->sectors << 9);
+
+       if (attr == &sysfs_label) {
+               memcpy(buf, u->label, SB_LABEL_SIZE);
+               buf[SB_LABEL_SIZE] = '\0';
+               strcat(buf, "\n");
+               return strlen(buf);
+       }
+
+       return 0;
+}
+
+STORE(__bch_flash_dev)
+{
+       struct bcache_device *d = container_of(kobj, struct bcache_device,
+                                              kobj);
+       struct uuid_entry *u = &d->c->uuids[d->id];
+
+       sysfs_strtoul(data_csum,        d->data_csum);
+
+       if (attr == &sysfs_size) {
+               uint64_t v;
+               strtoi_h_or_return(buf, v);
+
+               u->sectors = v >> 9;
+               bch_uuid_write(d->c);
+               set_capacity(d->disk, u->sectors);
+       }
+
+       if (attr == &sysfs_label) {
+               memcpy(u->label, buf, SB_LABEL_SIZE);
+               bch_uuid_write(d->c);
+       }
+
+       if (attr == &sysfs_unregister) {
+               atomic_set(&d->detaching, 1);
+               bcache_device_stop(d);
+       }
+
+       return size;
+}
+STORE_LOCKED(bch_flash_dev)
+
+static struct attribute *bch_flash_dev_files[] = {
+       &sysfs_unregister,
+#if 0
+       &sysfs_data_csum,
+#endif
+       &sysfs_label,
+       &sysfs_size,
+       NULL
+};
+KTYPE(bch_flash_dev);
+
+SHOW(__bch_cache_set)
+{
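+       /*
+        * These local helpers are gcc nested functions (a GNU C extension);
+        * they're only used by the show() code below.
+        */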
+       unsigned root_usage(struct cache_set *c)
+       {
+               unsigned bytes = 0;
+               struct bkey *k;
+               struct btree *b;
+               struct btree_iter iter;
+
+               goto lock_root;
+
+               do {
+                       rw_unlock(false, b);
+lock_root:
+                       b = c->root;
+                       rw_lock(false, b, b->level);
+               } while (b != c->root);
+
+               for_each_key_filter(b, k, &iter, bch_ptr_bad)
+                       bytes += bkey_bytes(k);
+
+               rw_unlock(false, b);
+
+               return (bytes * 100) / btree_bytes(c);
+       }
+
+       size_t cache_size(struct cache_set *c)
+       {
+               size_t ret = 0;
+               struct btree *b;
+
+               mutex_lock(&c->bucket_lock);
+               list_for_each_entry(b, &c->btree_cache, list)
+                       ret += 1 << (b->page_order + PAGE_SHIFT);
+
+               mutex_unlock(&c->bucket_lock);
+               return ret;
+       }
+
+       unsigned cache_max_chain(struct cache_set *c)
+       {
+               unsigned ret = 0;
+               struct hlist_head *h;
+
+               mutex_lock(&c->bucket_lock);
+
+               for (h = c->bucket_hash;
+                    h < c->bucket_hash + (1 << BUCKET_HASH_BITS);
+                    h++) {
+                       unsigned i = 0;
+                       struct hlist_node *p;
+
+                       hlist_for_each(p, h)
+                               i++;
+
+                       ret = max(ret, i);
+               }
+
+               mutex_unlock(&c->bucket_lock);
+               return ret;
+       }
+
+       unsigned btree_used(struct cache_set *c)
+       {
+               return div64_u64(c->gc_stats.key_bytes * 100,
+                                (c->gc_stats.nodes ?: 1) * btree_bytes(c));
+       }
+
+       unsigned average_key_size(struct cache_set *c)
+       {
+               return c->gc_stats.nkeys
+                       ? div64_u64(c->gc_stats.data, c->gc_stats.nkeys)
+                       : 0;
+       }
+
+       struct cache_set *c = container_of(kobj, struct cache_set, kobj);
+
+       sysfs_print(synchronous,                CACHE_SYNC(&c->sb));
+       sysfs_print(journal_delay_ms,           c->journal_delay_ms);
+       sysfs_hprint(bucket_size,               bucket_bytes(c));
+       sysfs_hprint(block_size,                block_bytes(c));
+       sysfs_print(tree_depth,                 c->root->level);
+       sysfs_print(root_usage_percent,         root_usage(c));
+
+       sysfs_hprint(btree_cache_size,          cache_size(c));
+       sysfs_print(btree_cache_max_chain,      cache_max_chain(c));
+       sysfs_print(cache_available_percent,    100 - c->gc_stats.in_use);
+
+       sysfs_print_time_stats(&c->btree_gc_time,       btree_gc, sec, ms);
+       sysfs_print_time_stats(&c->btree_split_time,    btree_split, sec, us);
+       sysfs_print_time_stats(&c->sort_time,           btree_sort, ms, us);
+       sysfs_print_time_stats(&c->btree_read_time,     btree_read, ms, us);
+       sysfs_print_time_stats(&c->try_harder_time,     try_harder, ms, us);
+
+       sysfs_print(btree_used_percent, btree_used(c));
+       sysfs_print(btree_nodes,        c->gc_stats.nodes);
+       sysfs_hprint(dirty_data,        c->gc_stats.dirty);
+       sysfs_hprint(average_key_size,  average_key_size(c));
+
+       sysfs_print(cache_read_races,
+                   atomic_long_read(&c->cache_read_races));
+
+       sysfs_print(writeback_keys_done,
+                   atomic_long_read(&c->writeback_keys_done));
+       sysfs_print(writeback_keys_failed,
+                   atomic_long_read(&c->writeback_keys_failed));
+
+       /* See count_io_errors for why 88 */
+       sysfs_print(io_error_halflife,  c->error_decay * 88);
+       sysfs_print(io_error_limit,     c->error_limit >> IO_ERROR_SHIFT);
+
+       sysfs_hprint(congested,
+                    ((uint64_t) bch_get_congested(c)) << 9);
+       sysfs_print(congested_read_threshold_us,
+                   c->congested_read_threshold_us);
+       sysfs_print(congested_write_threshold_us,
+                   c->congested_write_threshold_us);
+
+       sysfs_print(active_journal_entries,     fifo_used(&c->journal.pin));
+       sysfs_printf(verify,                    "%i", c->verify);
+       sysfs_printf(key_merging_disabled,      "%i", c->key_merging_disabled);
+       sysfs_printf(gc_always_rewrite,         "%i", c->gc_always_rewrite);
+       sysfs_printf(btree_shrinker_disabled,   "%i", c->shrinker_disabled);
+       sysfs_printf(copy_gc_enabled,           "%i", c->copy_gc_enabled);
+
+       if (attr == &sysfs_bset_tree_stats)
+               return bch_bset_print_stats(c, buf);
+
+       return 0;
+}
+SHOW_LOCKED(bch_cache_set)
+
+STORE(__bch_cache_set)
+{
+       struct cache_set *c = container_of(kobj, struct cache_set, kobj);
+
+       if (attr == &sysfs_unregister)
+               bch_cache_set_unregister(c);
+
+       if (attr == &sysfs_stop)
+               bch_cache_set_stop(c);
+
+       if (attr == &sysfs_synchronous) {
+               bool sync = strtoul_or_return(buf);
+
+               if (sync != CACHE_SYNC(&c->sb)) {
+                       SET_CACHE_SYNC(&c->sb, sync);
+                       bcache_write_super(c);
+               }
+       }
+
+       if (attr == &sysfs_flash_vol_create) {
+               int r;
+               uint64_t v;
+               strtoi_h_or_return(buf, v);
+
+               r = bch_flash_dev_create(c, v);
+               if (r)
+                       return r;
+       }
+
+       if (attr == &sysfs_clear_stats) {
+               atomic_long_set(&c->writeback_keys_done,        0);
+               atomic_long_set(&c->writeback_keys_failed,      0);
+
+               memset(&c->gc_stats, 0, sizeof(struct gc_stat));
+               bch_cache_accounting_clear(&c->accounting);
+       }
+
+       if (attr == &sysfs_trigger_gc)
+               bch_queue_gc(c);
+
+       if (attr == &sysfs_prune_cache) {
+               struct shrink_control sc;
+               sc.gfp_mask = GFP_KERNEL;
+               sc.nr_to_scan = strtoul_or_return(buf);
+               c->shrink.shrink(&c->shrink, &sc);
+       }
+
+       sysfs_strtoul(congested_read_threshold_us,
+                     c->congested_read_threshold_us);
+       sysfs_strtoul(congested_write_threshold_us,
+                     c->congested_write_threshold_us);
+
+       if (attr == &sysfs_io_error_limit)
+               c->error_limit = strtoul_or_return(buf) << IO_ERROR_SHIFT;
+
+       /* See count_io_errors() for why 88 */
+       if (attr == &sysfs_io_error_halflife)
+               c->error_decay = strtoul_or_return(buf) / 88;
+
+       sysfs_strtoul(journal_delay_ms,         c->journal_delay_ms);
+       sysfs_strtoul(verify,                   c->verify);
+       sysfs_strtoul(key_merging_disabled,     c->key_merging_disabled);
+       sysfs_strtoul(gc_always_rewrite,        c->gc_always_rewrite);
+       sysfs_strtoul(btree_shrinker_disabled,  c->shrinker_disabled);
+       sysfs_strtoul(copy_gc_enabled,          c->copy_gc_enabled);
+
+       return size;
+}
+STORE_LOCKED(bch_cache_set)
+
+SHOW(bch_cache_set_internal)
+{
+       struct cache_set *c = container_of(kobj, struct cache_set, internal);
+       return bch_cache_set_show(&c->kobj, attr, buf);
+}
+
+STORE(bch_cache_set_internal)
+{
+       struct cache_set *c = container_of(kobj, struct cache_set, internal);
+       return bch_cache_set_store(&c->kobj, attr, buf, size);
+}
+
+static void bch_cache_set_internal_release(struct kobject *k)
+{
+}
+
+static struct attribute *bch_cache_set_files[] = {
+       &sysfs_unregister,
+       &sysfs_stop,
+       &sysfs_synchronous,
+       &sysfs_journal_delay_ms,
+       &sysfs_flash_vol_create,
+
+       &sysfs_bucket_size,
+       &sysfs_block_size,
+       &sysfs_tree_depth,
+       &sysfs_root_usage_percent,
+       &sysfs_btree_cache_size,
+       &sysfs_cache_available_percent,
+
+       &sysfs_average_key_size,
+       &sysfs_dirty_data,
+
+       &sysfs_io_error_limit,
+       &sysfs_io_error_halflife,
+       &sysfs_congested,
+       &sysfs_congested_read_threshold_us,
+       &sysfs_congested_write_threshold_us,
+       &sysfs_clear_stats,
+       NULL
+};
+KTYPE(bch_cache_set);
+
+static struct attribute *bch_cache_set_internal_files[] = {
+       &sysfs_active_journal_entries,
+
+       sysfs_time_stats_attribute_list(btree_gc, sec, ms)
+       sysfs_time_stats_attribute_list(btree_split, sec, us)
+       sysfs_time_stats_attribute_list(btree_sort, ms, us)
+       sysfs_time_stats_attribute_list(btree_read, ms, us)
+       sysfs_time_stats_attribute_list(try_harder, ms, us)
+
+       &sysfs_btree_nodes,
+       &sysfs_btree_used_percent,
+       &sysfs_btree_cache_max_chain,
+
+       &sysfs_bset_tree_stats,
+       &sysfs_cache_read_races,
+       &sysfs_writeback_keys_done,
+       &sysfs_writeback_keys_failed,
+
+       &sysfs_trigger_gc,
+       &sysfs_prune_cache,
+#ifdef CONFIG_BCACHE_DEBUG
+       &sysfs_verify,
+       &sysfs_key_merging_disabled,
+#endif
+       &sysfs_gc_always_rewrite,
+       &sysfs_btree_shrinker_disabled,
+       &sysfs_copy_gc_enabled,
+       NULL
+};
+KTYPE(bch_cache_set_internal);
+
+SHOW(__bch_cache)
+{
+       struct cache *ca = container_of(kobj, struct cache, kobj);
+
+       sysfs_hprint(bucket_size,       bucket_bytes(ca));
+       sysfs_hprint(block_size,        block_bytes(ca));
+       sysfs_print(nbuckets,           ca->sb.nbuckets);
+       sysfs_print(discard,            ca->discard);
+       sysfs_hprint(written, atomic_long_read(&ca->sectors_written) << 9);
+       sysfs_hprint(btree_written,
+                    atomic_long_read(&ca->btree_sectors_written) << 9);
+       sysfs_hprint(metadata_written,
+                    (atomic_long_read(&ca->meta_sectors_written) +
+                     atomic_long_read(&ca->btree_sectors_written)) << 9);
+
+       sysfs_print(io_errors,
+                   atomic_read(&ca->io_errors) >> IO_ERROR_SHIFT);
+
+       sysfs_print(freelist_percent, ca->free.size * 100 /
+                   ((size_t) ca->sb.nbuckets));
+
+       if (attr == &sysfs_cache_replacement_policy)
+               return bch_snprint_string_list(buf, PAGE_SIZE,
+                                              cache_replacement_policies,
+                                              CACHE_REPLACEMENT(&ca->sb));
+
+       if (attr == &sysfs_priority_stats) {
+               int cmp(const void *l, const void *r)
+               {       return *((uint16_t *) r) - *((uint16_t *) l); }
+
+               /* Number of quantiles we compute */
+               const unsigned nq = 31;
+
+               size_t n = ca->sb.nbuckets, i, unused, btree;
+               uint64_t sum = 0;
+               uint16_t q[nq], *p, *cached;
+               ssize_t ret;
+
+               cached = p = vmalloc(ca->sb.nbuckets * sizeof(uint16_t));
+               if (!p)
+                       return -ENOMEM;
+
+               mutex_lock(&ca->set->bucket_lock);
+               for (i = ca->sb.first_bucket; i < n; i++)
+                       p[i] = ca->buckets[i].prio;
+               mutex_unlock(&ca->set->bucket_lock);
+
+               sort(p, n, sizeof(uint16_t), cmp, NULL);
+
+               while (n &&
+                      !cached[n - 1])
+                       --n;
+
+               unused = ca->sb.nbuckets - n;
+
+               while (cached < p + n &&
+                      *cached == BTREE_PRIO)
+                       cached++;
+
+               btree = cached - p;
+               n -= btree;
+
+               for (i = 0; i < n; i++)
+                       sum += INITIAL_PRIO - cached[i];
+
+               if (n)
+                       do_div(sum, n);
+
+               for (i = 0; i < nq; i++)
+                       q[i] = INITIAL_PRIO - cached[n * (i + 1) / (nq + 1)];
+
+               vfree(p);
+
+               ret = snprintf(buf, PAGE_SIZE,
+                              "Unused:         %zu%%\n"
+                              "Metadata:       %zu%%\n"
+                              "Average:        %llu\n"
+                              "Sectors per Q:  %zu\n"
+                              "Quantiles:      [",
+                              unused * 100 / (size_t) ca->sb.nbuckets,
+                              btree * 100 / (size_t) ca->sb.nbuckets, sum,
+                              n * ca->sb.bucket_size / (nq + 1));
+
+               for (i = 0; i < nq && ret < (ssize_t) PAGE_SIZE; i++)
+                       ret += snprintf(buf + ret, PAGE_SIZE - ret,
+                                       i < nq - 1 ? "%u " : "%u]\n", q[i]);
+
+               buf[PAGE_SIZE - 1] = '\0';
+               return ret;
+       }
+
+       return 0;
+}
+SHOW_LOCKED(bch_cache)
+
+STORE(__bch_cache)
+{
+       struct cache *ca = container_of(kobj, struct cache, kobj);
+
+       if (attr == &sysfs_discard) {
+               bool v = strtoul_or_return(buf);
+
+               if (blk_queue_discard(bdev_get_queue(ca->bdev)))
+                       ca->discard = v;
+
+               if (v != CACHE_DISCARD(&ca->sb)) {
+                       SET_CACHE_DISCARD(&ca->sb, v);
+                       bcache_write_super(ca->set);
+               }
+       }
+
+       if (attr == &sysfs_cache_replacement_policy) {
+               ssize_t v = bch_read_string_list(buf, cache_replacement_policies);
+
+               if (v < 0)
+                       return v;
+
+               if ((unsigned) v != CACHE_REPLACEMENT(&ca->sb)) {
+                       mutex_lock(&ca->set->bucket_lock);
+                       SET_CACHE_REPLACEMENT(&ca->sb, v);
+                       mutex_unlock(&ca->set->bucket_lock);
+
+                       bcache_write_super(ca->set);
+               }
+       }
+
+       if (attr == &sysfs_freelist_percent) {
+               DECLARE_FIFO(long, free);
+               long i;
+               size_t p = strtoul_or_return(buf);
+
+               p = clamp_t(size_t,
+                           ((size_t) ca->sb.nbuckets * p) / 100,
+                           roundup_pow_of_two(ca->sb.nbuckets) >> 9,
+                           ca->sb.nbuckets / 2);
+
+               if (!init_fifo_exact(&free, p, GFP_KERNEL))
+                       return -ENOMEM;
+
+               mutex_lock(&ca->set->bucket_lock);
+
+               fifo_move(&free, &ca->free);
+               fifo_swap(&free, &ca->free);
+
+               mutex_unlock(&ca->set->bucket_lock);
+
+               while (fifo_pop(&free, i))
+                       atomic_dec(&ca->buckets[i].pin);
+
+               free_fifo(&free);
+       }
+
+       if (attr == &sysfs_clear_stats) {
+               atomic_long_set(&ca->sectors_written, 0);
+               atomic_long_set(&ca->btree_sectors_written, 0);
+               atomic_long_set(&ca->meta_sectors_written, 0);
+               atomic_set(&ca->io_count, 0);
+               atomic_set(&ca->io_errors, 0);
+       }
+
+       return size;
+}
+STORE_LOCKED(bch_cache)
+
+static struct attribute *bch_cache_files[] = {
+       &sysfs_bucket_size,
+       &sysfs_block_size,
+       &sysfs_nbuckets,
+       &sysfs_priority_stats,
+       &sysfs_discard,
+       &sysfs_written,
+       &sysfs_btree_written,
+       &sysfs_metadata_written,
+       &sysfs_io_errors,
+       &sysfs_clear_stats,
+       &sysfs_freelist_percent,
+       &sysfs_cache_replacement_policy,
+       NULL
+};
+KTYPE(bch_cache);
diff --git a/drivers/md/bcache/sysfs.h b/drivers/md/bcache/sysfs.h
new file mode 100644 (file)
index 0000000..0526fe9
--- /dev/null
@@ -0,0 +1,110 @@
+#ifndef _BCACHE_SYSFS_H_
+#define _BCACHE_SYSFS_H_
+
+#define KTYPE(type)                                                    \
+struct kobj_type type ## _ktype = {                                    \
+       .release        = type ## _release,                             \
+       .sysfs_ops      = &((const struct sysfs_ops) {                  \
+               .show   = type ## _show,                                \
+               .store  = type ## _store                                \
+       }),                                                             \
+       .default_attrs  = type ## _files                                \
+}
+
+#define SHOW(fn)                                                       \
+static ssize_t fn ## _show(struct kobject *kobj, struct attribute *attr,\
+                          char *buf)                                   \
+
+#define STORE(fn)                                                      \
+static ssize_t fn ## _store(struct kobject *kobj, struct attribute *attr,\
+                           const char *buf, size_t size)               \
+
+#define SHOW_LOCKED(fn)                                                        \
+SHOW(fn)                                                               \
+{                                                                      \
+       ssize_t ret;                                                    \
+       mutex_lock(&bch_register_lock);                                 \
+       ret = __ ## fn ## _show(kobj, attr, buf);                       \
+       mutex_unlock(&bch_register_lock);                               \
+       return ret;                                                     \
+}
+
+#define STORE_LOCKED(fn)                                               \
+STORE(fn)                                                              \
+{                                                                      \
+       ssize_t ret;                                                    \
+       mutex_lock(&bch_register_lock);                                 \
+       ret = __ ## fn ## _store(kobj, attr, buf, size);                \
+       mutex_unlock(&bch_register_lock);                               \
+       return ret;                                                     \
+}
+
+#define __sysfs_attribute(_name, _mode)                                        \
+       static struct attribute sysfs_##_name =                         \
+               { .name = #_name, .mode = _mode }
+
+#define write_attribute(n)     __sysfs_attribute(n, S_IWUSR)
+#define read_attribute(n)      __sysfs_attribute(n, S_IRUGO)
+#define rw_attribute(n)                __sysfs_attribute(n, S_IRUGO|S_IWUSR)
+
+#define sysfs_printf(file, fmt, ...)                                   \
+do {                                                                   \
+       if (attr == &sysfs_ ## file)                                    \
+               return snprintf(buf, PAGE_SIZE, fmt "\n", __VA_ARGS__); \
+} while (0)
+
+#define sysfs_print(file, var)                                         \
+do {                                                                   \
+       if (attr == &sysfs_ ## file)                                    \
+               return snprint(buf, PAGE_SIZE, var);                    \
+} while (0)
+
+#define sysfs_hprint(file, val)                                                \
+do {                                                                   \
+       if (attr == &sysfs_ ## file) {                                  \
+               ssize_t ret = bch_hprint(buf, val);                     \
+               strcat(buf, "\n");                                      \
+               return ret + 1;                                         \
+       }                                                               \
+} while (0)
+
+#define var_printf(_var, fmt)  sysfs_printf(_var, fmt, var(_var))
+#define var_print(_var)                sysfs_print(_var, var(_var))
+#define var_hprint(_var)       sysfs_hprint(_var, var(_var))
+
+#define sysfs_strtoul(file, var)                                       \
+do {                                                                   \
+       if (attr == &sysfs_ ## file)                                    \
+               return strtoul_safe(buf, var) ?: (ssize_t) size;        \
+} while (0)
+
+#define sysfs_strtoul_clamp(file, var, min, max)                       \
+do {                                                                   \
+       if (attr == &sysfs_ ## file)                                    \
+               return strtoul_safe_clamp(buf, var, min, max)           \
+                       ?: (ssize_t) size;                              \
+} while (0)
+
+#define strtoul_or_return(cp)                                          \
+({                                                                     \
+       unsigned long _v;                                               \
+       int _r = kstrtoul(cp, 10, &_v);                                 \
+       if (_r)                                                         \
+               return _r;                                              \
+       _v;                                                             \
+})
+
+#define strtoi_h_or_return(cp, v)                                      \
+do {                                                                   \
+       int _r = strtoi_h(cp, &v);                                      \
+       if (_r)                                                         \
+               return _r;                                              \
+} while (0)
+
+#define sysfs_hatoi(file, var)                                         \
+do {                                                                   \
+       if (attr == &sysfs_ ## file)                                    \
+               return strtoi_h(buf, &var) ?: (ssize_t) size;           \
+} while (0)
+
+#endif  /* _BCACHE_SYSFS_H_ */
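
Taken together, these helpers let a file like sysfs.c above declare its attributes and dispatch show/store with almost no boilerplate. A minimal sketch of the pattern follows; struct foo, its delay_ms field and the attribute name are hypothetical, and sysfs_print()/sysfs_strtoul() additionally rely on snprint() and strtoul_safe() from util.h further down:

	struct foo {
		struct kobject	kobj;
		unsigned	delay_ms;
	};

	rw_attribute(foo_delay_ms);

	SHOW(foo)
	{
		struct foo *f = container_of(kobj, struct foo, kobj);

		sysfs_print(foo_delay_ms, f->delay_ms);
		return 0;
	}

	STORE(foo)
	{
		struct foo *f = container_of(kobj, struct foo, kobj);

		sysfs_strtoul(foo_delay_ms, f->delay_ms);
		return size;
	}

	static void foo_release(struct kobject *k)
	{
	}

	static struct attribute *foo_files[] = {
		&sysfs_foo_delay_ms,
		NULL
	};
	KTYPE(foo);
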
diff --git a/drivers/md/bcache/trace.c b/drivers/md/bcache/trace.c
new file mode 100644 (file)
index 0000000..983f9bb
--- /dev/null
@@ -0,0 +1,26 @@
+#include "bcache.h"
+#include "btree.h"
+#include "request.h"
+
+#include <linux/module.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/bcache.h>
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_request_start);
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_request_end);
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_passthrough);
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_cache_hit);
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_cache_miss);
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_read_retry);
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_writethrough);
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_writeback);
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_write_skip);
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_read);
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_write);
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_write_dirty);
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_read_dirty);
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_journal_write);
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_cache_insert);
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_gc_start);
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_gc_end);
diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c
new file mode 100644 (file)
index 0000000..da3a99e
--- /dev/null
@@ -0,0 +1,377 @@
+/*
+ * random utility code, for bcache but in theory not specific to bcache
+ *
+ * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
+ * Copyright 2012 Google, Inc.
+ */
+
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/ctype.h>
+#include <linux/debugfs.h>
+#include <linux/module.h>
+#include <linux/seq_file.h>
+#include <linux/types.h>
+
+#include "util.h"
+
+#define simple_strtoint(c, end, base)  simple_strtol(c, end, base)
+#define simple_strtouint(c, end, base) simple_strtoul(c, end, base)
+
+#define STRTO_H(name, type)                                    \
+int bch_ ## name ## _h(const char *cp, type *res)              \
+{                                                              \
+       int u = 0;                                              \
+       char *e;                                                \
+       type i = simple_ ## name(cp, &e, 10);                   \
+                                                               \
+       switch (tolower(*e)) {                                  \
+       default:                                                \
+               return -EINVAL;                                 \
+       case 'y':                                               \
+       case 'z':                                               \
+               u++;                                            \
+       case 'e':                                               \
+               u++;                                            \
+       case 'p':                                               \
+               u++;                                            \
+       case 't':                                               \
+               u++;                                            \
+       case 'g':                                               \
+               u++;                                            \
+       case 'm':                                               \
+               u++;                                            \
+       case 'k':                                               \
+               u++;                                            \
+               if (e++ == cp)                                  \
+                       return -EINVAL;                         \
+       case '\n':                                              \
+       case '\0':                                              \
+               if (*e == '\n')                                 \
+                       e++;                                    \
+       }                                                       \
+                                                               \
+       if (*e)                                                 \
+               return -EINVAL;                                 \
+                                                               \
+       while (u--) {                                           \
+               if ((type) ~0 > 0 &&                            \
+                   (type) ~0 / 1024 <= i)                      \
+                       return -EINVAL;                         \
+               if ((i > 0 && ANYSINT_MAX(type) / 1024 < i) ||  \
+                   (i < 0 && -ANYSINT_MAX(type) / 1024 > i))   \
+                       return -EINVAL;                         \
+               i *= 1024;                                      \
+       }                                                       \
+                                                               \
+       *res = i;                                               \
+       return 0;                                               \
+}                                                              \
+
+STRTO_H(strtoint, int)
+STRTO_H(strtouint, unsigned int)
+STRTO_H(strtoll, long long)
+STRTO_H(strtoull, unsigned long long)
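
The resulting parsers accept the usual size suffixes case-insensitively; a small sketch, with values that follow from the suffix switch above (return-value checks omitted for brevity):

	unsigned long long v;

	bch_strtoull_h("2k", &v);	/* v == 2048 */
	bch_strtoull_h("1M", &v);	/* v == 1048576 */
	bch_strtoull_h("3x", &v);	/* returns -EINVAL: unrecognised suffix */
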
+
+ssize_t bch_hprint(char *buf, int64_t v)
+{
+       static const char units[] = "?kMGTPEZY";
+       char dec[4] = "";
+       int u, t = 0;
+
+       for (u = 0; v >= 1024 || v <= -1024; u++) {
+               t = v & ~(~0 << 10);
+               v >>= 10;
+       }
+
+       if (!u)
+               return sprintf(buf, "%lli", v);
+
+       if (v < 100 && v > -100)
+               snprintf(dec, sizeof(dec), ".%i", t / 100);
+
+       return sprintf(buf, "%lli%s%c", v, dec, units[u]);
+}
+
+ssize_t bch_snprint_string_list(char *buf, size_t size, const char * const list[],
+                           size_t selected)
+{
+       char *out = buf;
+       size_t i;
+
+       for (i = 0; list[i]; i++)
+               out += snprintf(out, buf + size - out,
+                               i == selected ? "[%s] " : "%s ", list[i]);
+
+       out[-1] = '\n';
+       return out - buf;
+}
+
+ssize_t bch_read_string_list(const char *buf, const char * const list[])
+{
+       size_t i;
+       char *s, *d = kstrndup(buf, PAGE_SIZE - 1, GFP_KERNEL);
+       if (!d)
+               return -ENOMEM;
+
+       s = strim(d);
+
+       for (i = 0; list[i]; i++)
+               if (!strcmp(list[i], s))
+                       break;
+
+       kfree(d);
+
+       if (!list[i])
+               return -EINVAL;
+
+       return i;
+}
+
+bool bch_is_zero(const char *p, size_t n)
+{
+       size_t i;
+
+       for (i = 0; i < n; i++)
+               if (p[i])
+                       return false;
+       return true;
+}
+
+int bch_parse_uuid(const char *s, char *uuid)
+{
+       size_t i, j, x;
+       memset(uuid, 0, 16);
+
+       for (i = 0, j = 0;
+            i < strspn(s, "-0123456789:ABCDEFabcdef") && j < 32;
+            i++) {
+               x = s[i] | 32;
+
+               switch (x) {
+               case '0'...'9':
+                       x -= '0';
+                       break;
+               case 'a'...'f':
+                       x -= 'a' - 10;
+                       break;
+               default:
+                       continue;
+               }
+
+               if (!(j & 1))
+                       x <<= 4;
+               uuid[j++ >> 1] |= x;
+       }
+       return i;
+}
+
+void bch_time_stats_update(struct time_stats *stats, uint64_t start_time)
+{
+       uint64_t now            = local_clock();
+       uint64_t duration       = time_after64(now, start_time)
+               ? now - start_time : 0;
+       uint64_t last           = time_after64(now, stats->last)
+               ? now - stats->last : 0;
+
+       stats->max_duration = max(stats->max_duration, duration);
+
+       if (stats->last) {
+               ewma_add(stats->average_duration, duration, 8, 8);
+
+               if (stats->average_frequency)
+                       ewma_add(stats->average_frequency, last, 8, 8);
+               else
+                       stats->average_frequency  = last << 8;
+       } else {
+               stats->average_duration  = duration << 8;
+       }
+
+       stats->last = now ?: 1;
+}
+
+unsigned bch_next_delay(struct ratelimit *d, uint64_t done)
+{
+       uint64_t now = local_clock();
+
+       d->next += div_u64(done, d->rate);
+
+       return time_after64(d->next, now)
+               ? div_u64(d->next - now, NSEC_PER_SEC / HZ)
+               : 0;
+}
+
+void bch_bio_map(struct bio *bio, void *base)
+{
+       size_t size = bio->bi_size;
+       struct bio_vec *bv = bio->bi_io_vec;
+
+       BUG_ON(!bio->bi_size);
+       BUG_ON(bio->bi_vcnt);
+
+       bv->bv_offset = base ? ((unsigned long) base) % PAGE_SIZE : 0;
+       goto start;
+
+       for (; size; bio->bi_vcnt++, bv++) {
+               bv->bv_offset   = 0;
+start:         bv->bv_len      = min_t(size_t, PAGE_SIZE - bv->bv_offset,
+                                       size);
+               if (base) {
+                       bv->bv_page = is_vmalloc_addr(base)
+                               ? vmalloc_to_page(base)
+                               : virt_to_page(base);
+
+                       base += bv->bv_len;
+               }
+
+               size -= bv->bv_len;
+       }
+}
+
+int bch_bio_alloc_pages(struct bio *bio, gfp_t gfp)
+{
+       int i;
+       struct bio_vec *bv;
+
+       bio_for_each_segment(bv, bio, i) {
+               bv->bv_page = alloc_page(gfp);
+               if (!bv->bv_page) {
+                       while (bv-- != bio->bi_io_vec + bio->bi_idx)
+                               __free_page(bv->bv_page);
+                       return -ENOMEM;
+               }
+       }
+
+       return 0;
+}
+
+/*
+ * Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group (Any
+ * use permitted, subject to terms of PostgreSQL license; see.)
+
+ * If we have a 64-bit integer type, then a 64-bit CRC looks just like the
+ * usual sort of implementation. (See Ross Williams' excellent introduction
+ * A PAINLESS GUIDE TO CRC ERROR DETECTION ALGORITHMS, available from
+ * ftp://ftp.rocksoft.com/papers/crc_v3.txt or several other net sites.)
+ * If we have no working 64-bit type, then fake it with two 32-bit registers.
+ *
+ * The present implementation is a normal (not "reflected", in Williams'
+ * terms) 64-bit CRC, using initial all-ones register contents and a final
+ * bit inversion. The chosen polynomial is borrowed from the DLT1 spec
+ * (ECMA-182, available from http://www.ecma.ch/ecma1/STAND/ECMA-182.HTM):
+ *
+ * x^64 + x^62 + x^57 + x^55 + x^54 + x^53 + x^52 + x^47 + x^46 + x^45 +
+ * x^40 + x^39 + x^38 + x^37 + x^35 + x^33 + x^32 + x^31 + x^29 + x^27 +
+ * x^24 + x^23 + x^22 + x^21 + x^19 + x^17 + x^13 + x^12 + x^10 + x^9 +
+ * x^7 + x^4 + x + 1
+*/
+
+static const uint64_t crc_table[256] = {
+       0x0000000000000000ULL, 0x42F0E1EBA9EA3693ULL, 0x85E1C3D753D46D26ULL,
+       0xC711223CFA3E5BB5ULL, 0x493366450E42ECDFULL, 0x0BC387AEA7A8DA4CULL,
+       0xCCD2A5925D9681F9ULL, 0x8E224479F47CB76AULL, 0x9266CC8A1C85D9BEULL,
+       0xD0962D61B56FEF2DULL, 0x17870F5D4F51B498ULL, 0x5577EEB6E6BB820BULL,
+       0xDB55AACF12C73561ULL, 0x99A54B24BB2D03F2ULL, 0x5EB4691841135847ULL,
+       0x1C4488F3E8F96ED4ULL, 0x663D78FF90E185EFULL, 0x24CD9914390BB37CULL,
+       0xE3DCBB28C335E8C9ULL, 0xA12C5AC36ADFDE5AULL, 0x2F0E1EBA9EA36930ULL,
+       0x6DFEFF5137495FA3ULL, 0xAAEFDD6DCD770416ULL, 0xE81F3C86649D3285ULL,
+       0xF45BB4758C645C51ULL, 0xB6AB559E258E6AC2ULL, 0x71BA77A2DFB03177ULL,
+       0x334A9649765A07E4ULL, 0xBD68D2308226B08EULL, 0xFF9833DB2BCC861DULL,
+       0x388911E7D1F2DDA8ULL, 0x7A79F00C7818EB3BULL, 0xCC7AF1FF21C30BDEULL,
+       0x8E8A101488293D4DULL, 0x499B3228721766F8ULL, 0x0B6BD3C3DBFD506BULL,
+       0x854997BA2F81E701ULL, 0xC7B97651866BD192ULL, 0x00A8546D7C558A27ULL,
+       0x4258B586D5BFBCB4ULL, 0x5E1C3D753D46D260ULL, 0x1CECDC9E94ACE4F3ULL,
+       0xDBFDFEA26E92BF46ULL, 0x990D1F49C77889D5ULL, 0x172F5B3033043EBFULL,
+       0x55DFBADB9AEE082CULL, 0x92CE98E760D05399ULL, 0xD03E790CC93A650AULL,
+       0xAA478900B1228E31ULL, 0xE8B768EB18C8B8A2ULL, 0x2FA64AD7E2F6E317ULL,
+       0x6D56AB3C4B1CD584ULL, 0xE374EF45BF6062EEULL, 0xA1840EAE168A547DULL,
+       0x66952C92ECB40FC8ULL, 0x2465CD79455E395BULL, 0x3821458AADA7578FULL,
+       0x7AD1A461044D611CULL, 0xBDC0865DFE733AA9ULL, 0xFF3067B657990C3AULL,
+       0x711223CFA3E5BB50ULL, 0x33E2C2240A0F8DC3ULL, 0xF4F3E018F031D676ULL,
+       0xB60301F359DBE0E5ULL, 0xDA050215EA6C212FULL, 0x98F5E3FE438617BCULL,
+       0x5FE4C1C2B9B84C09ULL, 0x1D14202910527A9AULL, 0x93366450E42ECDF0ULL,
+       0xD1C685BB4DC4FB63ULL, 0x16D7A787B7FAA0D6ULL, 0x5427466C1E109645ULL,
+       0x4863CE9FF6E9F891ULL, 0x0A932F745F03CE02ULL, 0xCD820D48A53D95B7ULL,
+       0x8F72ECA30CD7A324ULL, 0x0150A8DAF8AB144EULL, 0x43A04931514122DDULL,
+       0x84B16B0DAB7F7968ULL, 0xC6418AE602954FFBULL, 0xBC387AEA7A8DA4C0ULL,
+       0xFEC89B01D3679253ULL, 0x39D9B93D2959C9E6ULL, 0x7B2958D680B3FF75ULL,
+       0xF50B1CAF74CF481FULL, 0xB7FBFD44DD257E8CULL, 0x70EADF78271B2539ULL,
+       0x321A3E938EF113AAULL, 0x2E5EB66066087D7EULL, 0x6CAE578BCFE24BEDULL,
+       0xABBF75B735DC1058ULL, 0xE94F945C9C3626CBULL, 0x676DD025684A91A1ULL,
+       0x259D31CEC1A0A732ULL, 0xE28C13F23B9EFC87ULL, 0xA07CF2199274CA14ULL,
+       0x167FF3EACBAF2AF1ULL, 0x548F120162451C62ULL, 0x939E303D987B47D7ULL,
+       0xD16ED1D631917144ULL, 0x5F4C95AFC5EDC62EULL, 0x1DBC74446C07F0BDULL,
+       0xDAAD56789639AB08ULL, 0x985DB7933FD39D9BULL, 0x84193F60D72AF34FULL,
+       0xC6E9DE8B7EC0C5DCULL, 0x01F8FCB784FE9E69ULL, 0x43081D5C2D14A8FAULL,
+       0xCD2A5925D9681F90ULL, 0x8FDAB8CE70822903ULL, 0x48CB9AF28ABC72B6ULL,
+       0x0A3B7B1923564425ULL, 0x70428B155B4EAF1EULL, 0x32B26AFEF2A4998DULL,
+       0xF5A348C2089AC238ULL, 0xB753A929A170F4ABULL, 0x3971ED50550C43C1ULL,
+       0x7B810CBBFCE67552ULL, 0xBC902E8706D82EE7ULL, 0xFE60CF6CAF321874ULL,
+       0xE224479F47CB76A0ULL, 0xA0D4A674EE214033ULL, 0x67C58448141F1B86ULL,
+       0x253565A3BDF52D15ULL, 0xAB1721DA49899A7FULL, 0xE9E7C031E063ACECULL,
+       0x2EF6E20D1A5DF759ULL, 0x6C0603E6B3B7C1CAULL, 0xF6FAE5C07D3274CDULL,
+       0xB40A042BD4D8425EULL, 0x731B26172EE619EBULL, 0x31EBC7FC870C2F78ULL,
+       0xBFC9838573709812ULL, 0xFD39626EDA9AAE81ULL, 0x3A28405220A4F534ULL,
+       0x78D8A1B9894EC3A7ULL, 0x649C294A61B7AD73ULL, 0x266CC8A1C85D9BE0ULL,
+       0xE17DEA9D3263C055ULL, 0xA38D0B769B89F6C6ULL, 0x2DAF4F0F6FF541ACULL,
+       0x6F5FAEE4C61F773FULL, 0xA84E8CD83C212C8AULL, 0xEABE6D3395CB1A19ULL,
+       0x90C79D3FEDD3F122ULL, 0xD2377CD44439C7B1ULL, 0x15265EE8BE079C04ULL,
+       0x57D6BF0317EDAA97ULL, 0xD9F4FB7AE3911DFDULL, 0x9B041A914A7B2B6EULL,
+       0x5C1538ADB04570DBULL, 0x1EE5D94619AF4648ULL, 0x02A151B5F156289CULL,
+       0x4051B05E58BC1E0FULL, 0x87409262A28245BAULL, 0xC5B073890B687329ULL,
+       0x4B9237F0FF14C443ULL, 0x0962D61B56FEF2D0ULL, 0xCE73F427ACC0A965ULL,
+       0x8C8315CC052A9FF6ULL, 0x3A80143F5CF17F13ULL, 0x7870F5D4F51B4980ULL,
+       0xBF61D7E80F251235ULL, 0xFD913603A6CF24A6ULL, 0x73B3727A52B393CCULL,
+       0x31439391FB59A55FULL, 0xF652B1AD0167FEEAULL, 0xB4A25046A88DC879ULL,
+       0xA8E6D8B54074A6ADULL, 0xEA16395EE99E903EULL, 0x2D071B6213A0CB8BULL,
+       0x6FF7FA89BA4AFD18ULL, 0xE1D5BEF04E364A72ULL, 0xA3255F1BE7DC7CE1ULL,
+       0x64347D271DE22754ULL, 0x26C49CCCB40811C7ULL, 0x5CBD6CC0CC10FAFCULL,
+       0x1E4D8D2B65FACC6FULL, 0xD95CAF179FC497DAULL, 0x9BAC4EFC362EA149ULL,
+       0x158E0A85C2521623ULL, 0x577EEB6E6BB820B0ULL, 0x906FC95291867B05ULL,
+       0xD29F28B9386C4D96ULL, 0xCEDBA04AD0952342ULL, 0x8C2B41A1797F15D1ULL,
+       0x4B3A639D83414E64ULL, 0x09CA82762AAB78F7ULL, 0x87E8C60FDED7CF9DULL,
+       0xC51827E4773DF90EULL, 0x020905D88D03A2BBULL, 0x40F9E43324E99428ULL,
+       0x2CFFE7D5975E55E2ULL, 0x6E0F063E3EB46371ULL, 0xA91E2402C48A38C4ULL,
+       0xEBEEC5E96D600E57ULL, 0x65CC8190991CB93DULL, 0x273C607B30F68FAEULL,
+       0xE02D4247CAC8D41BULL, 0xA2DDA3AC6322E288ULL, 0xBE992B5F8BDB8C5CULL,
+       0xFC69CAB42231BACFULL, 0x3B78E888D80FE17AULL, 0x7988096371E5D7E9ULL,
+       0xF7AA4D1A85996083ULL, 0xB55AACF12C735610ULL, 0x724B8ECDD64D0DA5ULL,
+       0x30BB6F267FA73B36ULL, 0x4AC29F2A07BFD00DULL, 0x08327EC1AE55E69EULL,
+       0xCF235CFD546BBD2BULL, 0x8DD3BD16FD818BB8ULL, 0x03F1F96F09FD3CD2ULL,
+       0x41011884A0170A41ULL, 0x86103AB85A2951F4ULL, 0xC4E0DB53F3C36767ULL,
+       0xD8A453A01B3A09B3ULL, 0x9A54B24BB2D03F20ULL, 0x5D45907748EE6495ULL,
+       0x1FB5719CE1045206ULL, 0x919735E51578E56CULL, 0xD367D40EBC92D3FFULL,
+       0x1476F63246AC884AULL, 0x568617D9EF46BED9ULL, 0xE085162AB69D5E3CULL,
+       0xA275F7C11F7768AFULL, 0x6564D5FDE549331AULL, 0x279434164CA30589ULL,
+       0xA9B6706FB8DFB2E3ULL, 0xEB46918411358470ULL, 0x2C57B3B8EB0BDFC5ULL,
+       0x6EA7525342E1E956ULL, 0x72E3DAA0AA188782ULL, 0x30133B4B03F2B111ULL,
+       0xF7021977F9CCEAA4ULL, 0xB5F2F89C5026DC37ULL, 0x3BD0BCE5A45A6B5DULL,
+       0x79205D0E0DB05DCEULL, 0xBE317F32F78E067BULL, 0xFCC19ED95E6430E8ULL,
+       0x86B86ED5267CDBD3ULL, 0xC4488F3E8F96ED40ULL, 0x0359AD0275A8B6F5ULL,
+       0x41A94CE9DC428066ULL, 0xCF8B0890283E370CULL, 0x8D7BE97B81D4019FULL,
+       0x4A6ACB477BEA5A2AULL, 0x089A2AACD2006CB9ULL, 0x14DEA25F3AF9026DULL,
+       0x562E43B4931334FEULL, 0x913F6188692D6F4BULL, 0xD3CF8063C0C759D8ULL,
+       0x5DEDC41A34BBEEB2ULL, 0x1F1D25F19D51D821ULL, 0xD80C07CD676F8394ULL,
+       0x9AFCE626CE85B507ULL,
+};
+
+uint64_t bch_crc64_update(uint64_t crc, const void *_data, size_t len)
+{
+       const unsigned char *data = _data;
+
+       while (len--) {
+               int i = ((int) (crc >> 56) ^ *data++) & 0xFF;
+               crc = crc_table[i] ^ (crc << 8);
+       }
+
+       return crc;
+}
+
+uint64_t bch_crc64(const void *data, size_t len)
+{
+       uint64_t crc = 0xffffffffffffffffULL;
+
+       crc = bch_crc64_update(crc, data, len);
+
+       return crc ^ 0xffffffffffffffffULL;
+}
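
Since bch_crc64_update() is a plain byte-at-a-time fold and bch_crc64() only wraps it in the initial and final inversion described in the comment above, the checksum can also be computed incrementally. A small sketch (the helper name is made up for illustration):

	static uint64_t crc64_in_two_pieces(const void *a, size_t len_a,
					    const void *b, size_t len_b)
	{
		uint64_t crc = 0xffffffffffffffffULL;

		crc = bch_crc64_update(crc, a, len_a);
		crc = bch_crc64_update(crc, b, len_b);

		/* equals bch_crc64() over the concatenation of a and b */
		return crc ^ 0xffffffffffffffffULL;
	}
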
diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h
new file mode 100644 (file)
index 0000000..577393e
--- /dev/null
@@ -0,0 +1,589 @@
+
+#ifndef _BCACHE_UTIL_H
+#define _BCACHE_UTIL_H
+
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/llist.h>
+#include <linux/ratelimit.h>
+#include <linux/vmalloc.h>
+#include <linux/workqueue.h>
+
+#include "closure.h"
+
+#define PAGE_SECTORS           (PAGE_SIZE / 512)
+
+struct closure;
+
+#include <trace/events/bcache.h>
+
+#ifdef CONFIG_BCACHE_EDEBUG
+
+#define atomic_dec_bug(v)      BUG_ON(atomic_dec_return(v) < 0)
+#define atomic_inc_bug(v, i)   BUG_ON(atomic_inc_return(v) <= i)
+
+#else /* EDEBUG */
+
+#define atomic_dec_bug(v)      atomic_dec(v)
+#define atomic_inc_bug(v, i)   atomic_inc(v)
+
+#endif
+
+#define BITMASK(name, type, field, offset, size)               \
+static inline uint64_t name(const type *k)                     \
+{ return (k->field >> offset) & ~(((uint64_t) ~0) << size); }  \
+                                                               \
+static inline void SET_##name(type *k, uint64_t v)             \
+{                                                              \
+       k->field &= ~(~((uint64_t) ~0 << size) << offset);      \
+       k->field |= v << offset;                                \
+}
+
+#define DECLARE_HEAP(type, name)                                       \
+       struct {                                                        \
+               size_t size, used;                                      \
+               type *data;                                             \
+       } name
+
+#define init_heap(heap, _size, gfp)                                    \
+({                                                                     \
+       size_t _bytes;                                                  \
+       (heap)->used = 0;                                               \
+       (heap)->size = (_size);                                         \
+       _bytes = (heap)->size * sizeof(*(heap)->data);                  \
+       (heap)->data = NULL;                                            \
+       if (_bytes < KMALLOC_MAX_SIZE)                                  \
+               (heap)->data = kmalloc(_bytes, (gfp));                  \
+       if ((!(heap)->data) && ((gfp) & GFP_KERNEL))                    \
+               (heap)->data = vmalloc(_bytes);                         \
+       (heap)->data;                                                   \
+})
+
+#define free_heap(heap)                                                        \
+do {                                                                   \
+       if (is_vmalloc_addr((heap)->data))                              \
+               vfree((heap)->data);                                    \
+       else                                                            \
+               kfree((heap)->data);                                    \
+       (heap)->data = NULL;                                            \
+} while (0)
+
+#define heap_swap(h, i, j)     swap((h)->data[i], (h)->data[j])
+
+#define heap_sift(h, i, cmp)                                           \
+do {                                                                   \
+       size_t _r, _j = i;                                              \
+                                                                       \
+       for (; _j * 2 + 1 < (h)->used; _j = _r) {                       \
+               _r = _j * 2 + 1;                                        \
+               if (_r + 1 < (h)->used &&                               \
+                   cmp((h)->data[_r], (h)->data[_r + 1]))              \
+                       _r++;                                           \
+                                                                       \
+               if (cmp((h)->data[_r], (h)->data[_j]))                  \
+                       break;                                          \
+               heap_swap(h, _r, _j);                                   \
+       }                                                               \
+} while (0)
+
+#define heap_sift_down(h, i, cmp)                                      \
+do {                                                                   \
+       while (i) {                                                     \
+               size_t p = (i - 1) / 2;                                 \
+               if (cmp((h)->data[i], (h)->data[p]))                    \
+                       break;                                          \
+               heap_swap(h, i, p);                                     \
+               i = p;                                                  \
+       }                                                               \
+} while (0)
+
+#define heap_add(h, d, cmp)                                            \
+({                                                                     \
+       bool _r = !heap_full(h);                                        \
+       if (_r) {                                                       \
+               size_t _i = (h)->used++;                                \
+               (h)->data[_i] = d;                                      \
+                                                                       \
+               heap_sift_down(h, _i, cmp);                             \
+               heap_sift(h, _i, cmp);                                  \
+       }                                                               \
+       _r;                                                             \
+})
+
+#define heap_pop(h, d, cmp)                                            \
+({                                                                     \
+       bool _r = (h)->used;                                            \
+       if (_r) {                                                       \
+               (d) = (h)->data[0];                                     \
+               (h)->used--;                                            \
+               heap_swap(h, 0, (h)->used);                             \
+               heap_sift(h, 0, cmp);                                   \
+       }                                                               \
+       _r;                                                             \
+})
+
+#define heap_peek(h)   ((h)->size ? (h)->data[0] : NULL)
+
+#define heap_full(h)   ((h)->used == (h)->size)
+
+#define DECLARE_FIFO(type, name)                                       \
+       struct {                                                        \
+               size_t front, back, size, mask;                         \
+               type *data;                                             \
+       } name
+
+#define fifo_for_each(c, fifo, iter)                                   \
+       for (iter = (fifo)->front;                                      \
+            c = (fifo)->data[iter], iter != (fifo)->back;              \
+            iter = (iter + 1) & (fifo)->mask)
+
+#define __init_fifo(fifo, gfp)                                         \
+({                                                                     \
+       size_t _allocated_size, _bytes;                                 \
+       BUG_ON(!(fifo)->size);                                          \
+                                                                       \
+       _allocated_size = roundup_pow_of_two((fifo)->size + 1);         \
+       _bytes = _allocated_size * sizeof(*(fifo)->data);               \
+                                                                       \
+       (fifo)->mask = _allocated_size - 1;                             \
+       (fifo)->front = (fifo)->back = 0;                               \
+       (fifo)->data = NULL;                                            \
+                                                                       \
+       if (_bytes < KMALLOC_MAX_SIZE)                                  \
+               (fifo)->data = kmalloc(_bytes, (gfp));                  \
+       if ((!(fifo)->data) && ((gfp) & GFP_KERNEL))                    \
+               (fifo)->data = vmalloc(_bytes);                         \
+       (fifo)->data;                                                   \
+})
+
+#define init_fifo_exact(fifo, _size, gfp)                              \
+({                                                                     \
+       (fifo)->size = (_size);                                         \
+       __init_fifo(fifo, gfp);                                         \
+})
+
+#define init_fifo(fifo, _size, gfp)                                    \
+({                                                                     \
+       (fifo)->size = (_size);                                         \
+       if ((fifo)->size > 4)                                           \
+               (fifo)->size = roundup_pow_of_two((fifo)->size) - 1;    \
+       __init_fifo(fifo, gfp);                                         \
+})
+
+#define free_fifo(fifo)                                                        \
+do {                                                                   \
+       if (is_vmalloc_addr((fifo)->data))                              \
+               vfree((fifo)->data);                                    \
+       else                                                            \
+               kfree((fifo)->data);                                    \
+       (fifo)->data = NULL;                                            \
+} while (0)
+
+#define fifo_used(fifo)                (((fifo)->back - (fifo)->front) & (fifo)->mask)
+#define fifo_free(fifo)                ((fifo)->size - fifo_used(fifo))
+
+#define fifo_empty(fifo)       (!fifo_used(fifo))
+#define fifo_full(fifo)                (!fifo_free(fifo))
+
+#define fifo_front(fifo)       ((fifo)->data[(fifo)->front])
+#define fifo_back(fifo)                                                        \
+       ((fifo)->data[((fifo)->back - 1) & (fifo)->mask])
+
+#define fifo_idx(fifo, p)      (((p) - &fifo_front(fifo)) & (fifo)->mask)
+
+#define fifo_push_back(fifo, i)                                                \
+({                                                                     \
+       bool _r = !fifo_full((fifo));                                   \
+       if (_r) {                                                       \
+               (fifo)->data[(fifo)->back++] = (i);                     \
+               (fifo)->back &= (fifo)->mask;                           \
+       }                                                               \
+       _r;                                                             \
+})
+
+#define fifo_pop_front(fifo, i)                                                \
+({                                                                     \
+       bool _r = !fifo_empty((fifo));                                  \
+       if (_r) {                                                       \
+               (i) = (fifo)->data[(fifo)->front++];                    \
+               (fifo)->front &= (fifo)->mask;                          \
+       }                                                               \
+       _r;                                                             \
+})
+
+#define fifo_push_front(fifo, i)                                       \
+({                                                                     \
+       bool _r = !fifo_full((fifo));                                   \
+       if (_r) {                                                       \
+               --(fifo)->front;                                        \
+               (fifo)->front &= (fifo)->mask;                          \
+               (fifo)->data[(fifo)->front] = (i);                      \
+       }                                                               \
+       _r;                                                             \
+})
+
+#define fifo_pop_back(fifo, i)                                         \
+({                                                                     \
+       bool _r = !fifo_empty((fifo));                                  \
+       if (_r) {                                                       \
+               --(fifo)->back;                                         \
+               (fifo)->back &= (fifo)->mask;                           \
+               (i) = (fifo)->data[(fifo)->back];                       \
+       }                                                               \
+       _r;                                                             \
+})
+
+#define fifo_push(fifo, i)     fifo_push_back(fifo, (i))
+#define fifo_pop(fifo, i)      fifo_pop_front(fifo, (i))
+
+#define fifo_swap(l, r)                                                        \
+do {                                                                   \
+       swap((l)->front, (r)->front);                                   \
+       swap((l)->back, (r)->back);                                     \
+       swap((l)->size, (r)->size);                                     \
+       swap((l)->mask, (r)->mask);                                     \
+       swap((l)->data, (r)->data);                                     \
+} while (0)
+
+#define fifo_move(dest, src)                                           \
+do {                                                                   \
+       typeof(*((dest)->data)) _t;                                     \
+       while (!fifo_full(dest) &&                                      \
+              fifo_pop(src, _t))                                       \
+               fifo_push(dest, _t);                                    \
+} while (0)
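
A minimal sketch of how these fifo helpers fit together (names and sizes are arbitrary; the freelist_percent store handler in sysfs.c above follows the same pattern with init_fifo_exact(), fifo_move() and fifo_swap()):

	static DECLARE_FIFO(long, demo_fifo);

	static int demo_fifo_usage(void)
	{
		long i;

		if (!init_fifo(&demo_fifo, 64, GFP_KERNEL))
			return -ENOMEM;

		while (fifo_push(&demo_fifo, 42L))
			;				/* stops once the fifo is full */

		while (fifo_pop(&demo_fifo, i))
			pr_debug("popped %ld\n", i);	/* drains front to back */

		free_fifo(&demo_fifo);
		return 0;
	}
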
+
+/*
+ * Simple array based allocator - preallocates a number of elements and you can
+ * never allocate more than that, also has no locking.
+ *
+ * Handy because if you know you only need a fixed number of elements you don't
+ * have to worry about memory allocation failure, and sometimes a mempool isn't
+ * what you want.
+ *
+ * We treat the free elements as entries in a singly linked list, and the
+ * freelist as a stack - allocating and freeing push and pop off the freelist.
+ */
+
+#define DECLARE_ARRAY_ALLOCATOR(type, name, size)                      \
+       struct {                                                        \
+               type    *freelist;                                      \
+               type    data[size];                                     \
+       } name
+
+#define array_alloc(array)                                             \
+({                                                                     \
+       typeof((array)->freelist) _ret = (array)->freelist;             \
+                                                                       \
+       if (_ret)                                                       \
+               (array)->freelist = *((typeof((array)->freelist) *) _ret);\
+                                                                       \
+       _ret;                                                           \
+})
+
+#define array_free(array, ptr)                                         \
+do {                                                                   \
+       typeof((array)->freelist) _ptr = ptr;                           \
+                                                                       \
+       *((typeof((array)->freelist) *) _ptr) = (array)->freelist;      \
+       (array)->freelist = _ptr;                                       \
+} while (0)
+
+#define array_allocator_init(array)                                    \
+do {                                                                   \
+       typeof((array)->freelist) _i;                                   \
+                                                                       \
+       BUILD_BUG_ON(sizeof((array)->data[0]) < sizeof(void *));        \
+       (array)->freelist = NULL;                                       \
+                                                                       \
+       for (_i = (array)->data;                                        \
+            _i < (array)->data + ARRAY_SIZE((array)->data);            \
+            _i++)                                                      \
+               array_free(array, _i);                                  \
+} while (0)
+
+#define array_freelist_empty(array)    ((array)->freelist == NULL)
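
A usage sketch of the allocator described in the comment above; struct io_entry is hypothetical, and any element type at least the size of a pointer satisfies the BUILD_BUG_ON in array_allocator_init():

	struct io_entry {
		unsigned long	payload[4];
	};

	static DECLARE_ARRAY_ALLOCATOR(struct io_entry, io_entries, 16);

	static void io_entries_demo(void)
	{
		struct io_entry *e;

		array_allocator_init(&io_entries);	/* threads all 16 entries onto the freelist */

		e = array_alloc(&io_entries);		/* NULL once all 16 are in use */
		if (e)
			array_free(&io_entries, e);	/* pushes it back onto the freelist */
	}
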
+
+#define ANYSINT_MAX(t)                                                 \
+       ((((t) 1 << (sizeof(t) * 8 - 2)) - (t) 1) * (t) 2 + (t) 1)
+
+int bch_strtoint_h(const char *, int *);
+int bch_strtouint_h(const char *, unsigned int *);
+int bch_strtoll_h(const char *, long long *);
+int bch_strtoull_h(const char *, unsigned long long *);
+
+static inline int bch_strtol_h(const char *cp, long *res)
+{
+#if BITS_PER_LONG == 32
+       return bch_strtoint_h(cp, (int *) res);
+#else
+       return bch_strtoll_h(cp, (long long *) res);
+#endif
+}
+
+static inline int bch_strtoul_h(const char *cp, long *res)
+{
+#if BITS_PER_LONG == 32
+       return bch_strtouint_h(cp, (unsigned int *) res);
+#else
+       return bch_strtoull_h(cp, (unsigned long long *) res);
+#endif
+}
+
+#define strtoi_h(cp, res)                                              \
+       (__builtin_types_compatible_p(typeof(*res), int)                \
+       ? bch_strtoint_h(cp, (void *) res)                              \
+       : __builtin_types_compatible_p(typeof(*res), long)              \
+       ? bch_strtol_h(cp, (void *) res)                                \
+       : __builtin_types_compatible_p(typeof(*res), long long)         \
+       ? bch_strtoll_h(cp, (void *) res)                               \
+       : __builtin_types_compatible_p(typeof(*res), unsigned int)      \
+       ? bch_strtouint_h(cp, (void *) res)                             \
+       : __builtin_types_compatible_p(typeof(*res), unsigned long)     \
+       ? bch_strtoul_h(cp, (void *) res)                               \
+       : __builtin_types_compatible_p(typeof(*res), unsigned long long)\
+       ? bch_strtoull_h(cp, (void *) res) : -EINVAL)
+
+#define strtoul_safe(cp, var)                                          \
+({                                                                     \
+       unsigned long _v;                                               \
+       int _r = kstrtoul(cp, 10, &_v);                                 \
+       if (!_r)                                                        \
+               var = _v;                                               \
+       _r;                                                             \
+})
+
+#define strtoul_safe_clamp(cp, var, min, max)                          \
+({                                                                     \
+       unsigned long _v;                                               \
+       int _r = kstrtoul(cp, 10, &_v);                                 \
+       if (!_r)                                                        \
+               var = clamp_t(typeof(var), _v, min, max);               \
+       _r;                                                             \
+})
+
+#define snprint(buf, size, var)                                                \
+       snprintf(buf, size,                                             \
+               __builtin_types_compatible_p(typeof(var), int)          \
+                    ? "%i\n" :                                         \
+               __builtin_types_compatible_p(typeof(var), unsigned)     \
+                    ? "%u\n" :                                         \
+               __builtin_types_compatible_p(typeof(var), long)         \
+                    ? "%li\n" :                                        \
+               __builtin_types_compatible_p(typeof(var), unsigned long)\
+                    ? "%lu\n" :                                        \
+               __builtin_types_compatible_p(typeof(var), int64_t)      \
+                    ? "%lli\n" :                                       \
+               __builtin_types_compatible_p(typeof(var), uint64_t)     \
+                    ? "%llu\n" :                                       \
+               __builtin_types_compatible_p(typeof(var), const char *) \
+                    ? "%s\n" : "%i\n", var)
+
+ssize_t bch_hprint(char *buf, int64_t v);
+
+bool bch_is_zero(const char *p, size_t n);
+int bch_parse_uuid(const char *s, char *uuid);
+
+ssize_t bch_snprint_string_list(char *buf, size_t size, const char * const list[],
+                           size_t selected);
+
+ssize_t bch_read_string_list(const char *buf, const char * const list[]);
+
+struct time_stats {
+       /*
+        * all fields are in nanoseconds, averages are ewmas stored left shifted
+        * by 8
+        */
+       uint64_t        max_duration;
+       uint64_t        average_duration;
+       uint64_t        average_frequency;
+       uint64_t        last;
+};
+
+void bch_time_stats_update(struct time_stats *stats, uint64_t time);
+
+#define NSEC_PER_ns                    1L
+#define NSEC_PER_us                    NSEC_PER_USEC
+#define NSEC_PER_ms                    NSEC_PER_MSEC
+#define NSEC_PER_sec                   NSEC_PER_SEC
+
+#define __print_time_stat(stats, name, stat, units)                    \
+       sysfs_print(name ## _ ## stat ## _ ## units,                    \
+                   div_u64((stats)->stat >> 8, NSEC_PER_ ## units))
+
+#define sysfs_print_time_stats(stats, name,                            \
+                              frequency_units,                         \
+                              duration_units)                          \
+do {                                                                   \
+       __print_time_stat(stats, name,                                  \
+                         average_frequency,    frequency_units);       \
+       __print_time_stat(stats, name,                                  \
+                         average_duration,     duration_units);        \
+       __print_time_stat(stats, name,                                  \
+                         max_duration,         duration_units);        \
+                                                                       \
+       sysfs_print(name ## _last_ ## frequency_units, (stats)->last    \
+                   ? div_s64(local_clock() - (stats)->last,            \
+                             NSEC_PER_ ## frequency_units)             \
+                   : -1LL);                                            \
+} while (0)
+
+#define sysfs_time_stats_attribute(name,                               \
+                                  frequency_units,                     \
+                                  duration_units)                      \
+read_attribute(name ## _average_frequency_ ## frequency_units);                \
+read_attribute(name ## _average_duration_ ## duration_units);          \
+read_attribute(name ## _max_duration_ ## duration_units);              \
+read_attribute(name ## _last_ ## frequency_units)
+
+#define sysfs_time_stats_attribute_list(name,                          \
+                                       frequency_units,                \
+                                       duration_units)                 \
+&sysfs_ ## name ## _average_frequency_ ## frequency_units,             \
+&sysfs_ ## name ## _average_duration_ ## duration_units,               \
+&sysfs_ ## name ## _max_duration_ ## duration_units,                   \
+&sysfs_ ## name ## _last_ ## frequency_units,
+
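+/*
+ * Exponentially weighted moving average, kept scaled up by 2^factor so that
+ * integer math preserves fractional precision:
+ *   ewma = (ewma * (weight - 1) + (val << factor)) / weight
+ * The expression itself evaluates to the average shifted back down by factor.
+ */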
+#define ewma_add(ewma, val, weight, factor)                            \
+({                                                                     \
+       (ewma) *= (weight) - 1;                                         \
+       (ewma) += (val) << factor;                                      \
+       (ewma) /= (weight);                                             \
+       (ewma) >> factor;                                               \
+})
+
+struct ratelimit {
+       uint64_t                next;
+       unsigned                rate;
+};
+
+static inline void ratelimit_reset(struct ratelimit *d)
+{
+       d->next = local_clock();
+}
+
+unsigned bch_next_delay(struct ratelimit *d, uint64_t done);
+
+#define __DIV_SAFE(n, d, zero)                                         \
+({                                                                     \
+       typeof(n) _n = (n);                                             \
+       typeof(d) _d = (d);                                             \
+       _d ? _n / _d : zero;                                            \
+})
+
+#define DIV_SAFE(n, d) __DIV_SAFE(n, d, 0)
+
+#define container_of_or_null(ptr, type, member)                                \
+({                                                                     \
+       typeof(ptr) _ptr = ptr;                                         \
+       _ptr ? container_of(_ptr, type, member) : NULL;                 \
+})
+
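+/*
+ * RB_INSERT() walks the tree with cmp(new, this); on finding an equal key it
+ * bails out through the local label and evaluates to -1, otherwise it links
+ * and rebalances the new node and evaluates to 0.
+ */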
+#define RB_INSERT(root, new, member, cmp)                              \
+({                                                                     \
+       __label__ dup;                                                  \
+       struct rb_node **n = &(root)->rb_node, *parent = NULL;          \
+       typeof(new) this;                                               \
+       int res, ret = -1;                                              \
+                                                                       \
+       while (*n) {                                                    \
+               parent = *n;                                            \
+               this = container_of(*n, typeof(*(new)), member);        \
+               res = cmp(new, this);                                   \
+               if (!res)                                               \
+                       goto dup;                                       \
+               n = res < 0                                             \
+                       ? &(*n)->rb_left                                \
+                       : &(*n)->rb_right;                              \
+       }                                                               \
+                                                                       \
+       rb_link_node(&(new)->member, parent, n);                        \
+       rb_insert_color(&(new)->member, root);                          \
+       ret = 0;                                                        \
+dup:                                                                   \
+       ret;                                                            \
+})
+
+#define RB_SEARCH(root, search, member, cmp)                           \
+({                                                                     \
+       struct rb_node *n = (root)->rb_node;                            \
+       typeof(&(search)) this, ret = NULL;                             \
+       int res;                                                        \
+                                                                       \
+       while (n) {                                                     \
+               this = container_of(n, typeof(search), member);         \
+               res = cmp(&(search), this);                             \
+               if (!res) {                                             \
+                       ret = this;                                     \
+                       break;                                          \
+               }                                                       \
+               n = res < 0                                             \
+                       ? n->rb_left                                    \
+                       : n->rb_right;                                  \
+       }                                                               \
+       ret;                                                            \
+})
+
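+/*
+ * RB_GREATER() returns the smallest element strictly greater than search,
+ * i.e. the leftmost node for which cmp(&(search), this) < 0, or NULL if
+ * there is none.
+ */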
+#define RB_GREATER(root, search, member, cmp)                          \
+({                                                                     \
+       struct rb_node *n = (root)->rb_node;                            \
+       typeof(&(search)) this, ret = NULL;                             \
+       int res;                                                        \
+                                                                       \
+       while (n) {                                                     \
+               this = container_of(n, typeof(search), member);         \
+               res = cmp(&(search), this);                             \
+               if (res < 0) {                                          \
+                       ret = this;                                     \
+                       n = n->rb_left;                                 \
+               } else                                                  \
+                       n = n->rb_right;                                \
+       }                                                               \
+       ret;                                                            \
+})
+
+#define RB_FIRST(root, type, member)                                   \
+       container_of_or_null(rb_first(root), type, member)
+
+#define RB_LAST(root, type, member)                                    \
+       container_of_or_null(rb_last(root), type, member)
+
+#define RB_NEXT(ptr, member)                                           \
+       container_of_or_null(rb_next(&(ptr)->member), typeof(*ptr), member)
+
+#define RB_PREV(ptr, member)                                           \
+       container_of_or_null(rb_prev(&(ptr)->member), typeof(*ptr), member)
+
+/* Does linear interpolation between powers of two */
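+/*
+ * i.e. roughly 2^(x / 2^fract_bits): the low fract_bits of x are treated as
+ * the fractional part and the rest as the integer exponent, giving
+ * 2^int * (1 + fract / 2^fract_bits).
+ */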
+static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits)
+{
+       unsigned fract = x & ~(~0 << fract_bits);
+
+       x >>= fract_bits;
+       x   = 1 << x;
+       x  += (x * fract) >> fract_bits;
+
+       return x;
+}
+
+#define bio_end(bio)   ((bio)->bi_sector + bio_sectors(bio))
+
+void bch_bio_map(struct bio *bio, void *base);
+
+int bch_bio_alloc_pages(struct bio *bio, gfp_t gfp);
+
+static inline sector_t bdev_sectors(struct block_device *bdev)
+{
+       return bdev->bd_inode->i_size >> 9;
+}
+
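+/*
+ * Submit a bio on behalf of a closure: take a ref on the closure first so it
+ * can't complete until the bio's endio handler drops it with closure_put().
+ */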
+#define closure_bio_submit(bio, cl, dev)                               \
+do {                                                                   \
+       closure_get(cl);                                                \
+       bch_generic_make_request(bio, &(dev)->bio_split_hook);          \
+} while (0)
+
+uint64_t bch_crc64_update(uint64_t, const void *, size_t);
+uint64_t bch_crc64(const void *, size_t);
+
+#endif /* _BCACHE_UTIL_H */
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
new file mode 100644 (file)
index 0000000..93e7e31
--- /dev/null
@@ -0,0 +1,414 @@
+/*
+ * background writeback - scan btree for dirty data and write it to the backing
+ * device
+ *
+ * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
+ * Copyright 2012 Google, Inc.
+ */
+
+#include "bcache.h"
+#include "btree.h"
+#include "debug.h"
+
+static struct workqueue_struct *dirty_wq;
+
+static void read_dirty(struct closure *);
+
+struct dirty_io {
+       struct closure          cl;
+       struct cached_dev       *dc;
+       struct bio              bio;
+};
+
+/* Rate limiting */
+
+static void __update_writeback_rate(struct cached_dev *dc)
+{
+       struct cache_set *c = dc->disk.c;
+       uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size;
+       uint64_t cache_dirty_target =
+               div_u64(cache_sectors * dc->writeback_percent, 100);
+
+       int64_t target = div64_u64(cache_dirty_target * bdev_sectors(dc->bdev),
+                                  c->cached_dev_sectors);
+
+       /* PD controller */
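+       /*
+        * Proportional term: how far the amount of dirty data is from the
+        * target. Derivative term: how fast it's changing, smoothed with an
+        * ewma (writeback_rate_d_smooth) so short bursts don't make the rate
+        * oscillate.
+        */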
+
+       int change = 0;
+       int64_t error;
+       int64_t dirty = atomic_long_read(&dc->disk.sectors_dirty);
+       int64_t derivative = dirty - dc->disk.sectors_dirty_last;
+
+       dc->disk.sectors_dirty_last = dirty;
+
+       derivative *= dc->writeback_rate_d_term;
+       derivative = clamp(derivative, -dirty, dirty);
+
+       derivative = ewma_add(dc->disk.sectors_dirty_derivative, derivative,
+                             dc->writeback_rate_d_smooth, 0);
+
+       /* Avoid divide by zero */
+       if (!target)
+               goto out;
+
+       error = div64_s64((dirty + derivative - target) << 8, target);
+
+       change = div_s64((dc->writeback_rate.rate * error) >> 8,
+                        dc->writeback_rate_p_term_inverse);
+
+       /* Don't increase writeback rate if the device isn't keeping up */
+       if (change > 0 &&
+           time_after64(local_clock(),
+                        dc->writeback_rate.next + 10 * NSEC_PER_MSEC))
+               change = 0;
+
+       dc->writeback_rate.rate =
+               clamp_t(int64_t, dc->writeback_rate.rate + change,
+                       1, NSEC_PER_MSEC);
+out:
+       dc->writeback_rate_derivative = derivative;
+       dc->writeback_rate_change = change;
+       dc->writeback_rate_target = target;
+
+       schedule_delayed_work(&dc->writeback_rate_update,
+                             dc->writeback_rate_update_seconds * HZ);
+}
+
+static void update_writeback_rate(struct work_struct *work)
+{
+       struct cached_dev *dc = container_of(to_delayed_work(work),
+                                            struct cached_dev,
+                                            writeback_rate_update);
+
+       down_read(&dc->writeback_lock);
+
+       if (atomic_read(&dc->has_dirty) &&
+           dc->writeback_percent)
+               __update_writeback_rate(dc);
+
+       up_read(&dc->writeback_lock);
+}
+
+static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors)
+{
+       if (atomic_read(&dc->disk.detaching) ||
+           !dc->writeback_percent)
+               return 0;
+
+       return bch_next_delay(&dc->writeback_rate, sectors * 10000000ULL);
+}
+
+/* Background writeback */
+
+static bool dirty_pred(struct keybuf *buf, struct bkey *k)
+{
+       return KEY_DIRTY(k);
+}
+
+static void dirty_init(struct keybuf_key *w)
+{
+       struct dirty_io *io = w->private;
+       struct bio *bio = &io->bio;
+
+       bio_init(bio);
+       if (!io->dc->writeback_percent)
+               bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
+
+       bio->bi_size            = KEY_SIZE(&w->key) << 9;
+       bio->bi_max_vecs        = DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS);
+       bio->bi_private         = w;
+       bio->bi_io_vec          = bio->bi_inline_vecs;
+       bch_bio_map(bio, NULL);
+}
+
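+/*
+ * Scan the btree for dirty keys belonging to this backing device and add
+ * them to dc->writeback_keys; once a full pass over the keyspace turns up
+ * nothing dirty, clear has_dirty and drop the ref taken in
+ * bch_writeback_add().
+ */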
+static void refill_dirty(struct closure *cl)
+{
+       struct cached_dev *dc = container_of(cl, struct cached_dev,
+                                            writeback.cl);
+       struct keybuf *buf = &dc->writeback_keys;
+       bool searched_from_start = false;
+       struct bkey end = MAX_KEY;
+       SET_KEY_INODE(&end, dc->disk.id);
+
+       if (!atomic_read(&dc->disk.detaching) &&
+           !dc->writeback_running)
+               closure_return(cl);
+
+       down_write(&dc->writeback_lock);
+
+       if (!atomic_read(&dc->has_dirty)) {
+               SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN);
+               bch_write_bdev_super(dc, NULL);
+
+               up_write(&dc->writeback_lock);
+               closure_return(cl);
+       }
+
+       if (bkey_cmp(&buf->last_scanned, &end) >= 0) {
+               buf->last_scanned = KEY(dc->disk.id, 0, 0);
+               searched_from_start = true;
+       }
+
+       bch_refill_keybuf(dc->disk.c, buf, &end);
+
+       if (bkey_cmp(&buf->last_scanned, &end) >= 0 && searched_from_start) {
+               /* Searched the entire btree - delay awhile */
+
+               if (RB_EMPTY_ROOT(&buf->keys)) {
+                       atomic_set(&dc->has_dirty, 0);
+                       cached_dev_put(dc);
+               }
+
+               if (!atomic_read(&dc->disk.detaching))
+                       closure_delay(&dc->writeback, dc->writeback_delay * HZ);
+       }
+
+       up_write(&dc->writeback_lock);
+
+       ratelimit_reset(&dc->writeback_rate);
+
+       /* Punt to workqueue only so we don't recurse and blow the stack */
+       continue_at(cl, read_dirty, dirty_wq);
+}
+
+void bch_writeback_queue(struct cached_dev *dc)
+{
+       if (closure_trylock(&dc->writeback.cl, &dc->disk.cl)) {
+               if (!atomic_read(&dc->disk.detaching))
+                       closure_delay(&dc->writeback, dc->writeback_delay * HZ);
+
+               continue_at(&dc->writeback.cl, refill_dirty, dirty_wq);
+       }
+}
+
+void bch_writeback_add(struct cached_dev *dc, unsigned sectors)
+{
+       atomic_long_add(sectors, &dc->disk.sectors_dirty);
+
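+       /*
+        * Only the thread that actually flips has_dirty from 0 to 1 takes a
+        * ref on the cached_dev, marks the backing superblock dirty and kicks
+        * writeback off; the plain atomic_read() is just a cheap early-out.
+        */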
+       if (!atomic_read(&dc->has_dirty) &&
+           !atomic_xchg(&dc->has_dirty, 1)) {
+               atomic_inc(&dc->count);
+
+               if (BDEV_STATE(&dc->sb) != BDEV_STATE_DIRTY) {
+                       SET_BDEV_STATE(&dc->sb, BDEV_STATE_DIRTY);
+                       /* XXX: should do this synchronously */
+                       bch_write_bdev_super(dc, NULL);
+               }
+
+               bch_writeback_queue(dc);
+
+               if (dc->writeback_percent)
+                       schedule_delayed_work(&dc->writeback_rate_update,
+                                     dc->writeback_rate_update_seconds * HZ);
+       }
+}
+
+/* Background writeback - IO loop */
+
+static void dirty_io_destructor(struct closure *cl)
+{
+       struct dirty_io *io = container_of(cl, struct dirty_io, cl);
+       kfree(io);
+}
+
+static void write_dirty_finish(struct closure *cl)
+{
+       struct dirty_io *io = container_of(cl, struct dirty_io, cl);
+       struct keybuf_key *w = io->bio.bi_private;
+       struct cached_dev *dc = io->dc;
+       struct bio_vec *bv = bio_iovec_idx(&io->bio, io->bio.bi_vcnt);
+
+       while (bv-- != io->bio.bi_io_vec)
+               __free_page(bv->bv_page);
+
+       /* This is kind of a dumb way of signalling errors. */
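+       /*
+        * dirty_endio() clears KEY_DIRTY on error, so if it is still set the
+        * write to the backing device succeeded and we can clear the dirty
+        * bit in the btree; BTREE_REPLACE makes the update a no-op if the key
+        * has been overwritten in the meantime.
+        */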
+       if (KEY_DIRTY(&w->key)) {
+               unsigned i;
+               struct btree_op op;
+               bch_btree_op_init_stack(&op);
+
+               op.type = BTREE_REPLACE;
+               bkey_copy(&op.replace, &w->key);
+
+               SET_KEY_DIRTY(&w->key, false);
+               bch_keylist_add(&op.keys, &w->key);
+
+               for (i = 0; i < KEY_PTRS(&w->key); i++)
+                       atomic_inc(&PTR_BUCKET(dc->disk.c, &w->key, i)->pin);
+
+               pr_debug("clearing %s", pkey(&w->key));
+               bch_btree_insert(&op, dc->disk.c);
+               closure_sync(&op.cl);
+
+               atomic_long_inc(op.insert_collision
+                               ? &dc->disk.c->writeback_keys_failed
+                               : &dc->disk.c->writeback_keys_done);
+       }
+
+       bch_keybuf_del(&dc->writeback_keys, w);
+       atomic_dec_bug(&dc->in_flight);
+
+       closure_wake_up(&dc->writeback_wait);
+
+       closure_return_with_destructor(cl, dirty_io_destructor);
+}
+
+static void dirty_endio(struct bio *bio, int error)
+{
+       struct keybuf_key *w = bio->bi_private;
+       struct dirty_io *io = w->private;
+
+       if (error)
+               SET_KEY_DIRTY(&w->key, false);
+
+       closure_put(&io->cl);
+}
+
+static void write_dirty(struct closure *cl)
+{
+       struct dirty_io *io = container_of(cl, struct dirty_io, cl);
+       struct keybuf_key *w = io->bio.bi_private;
+
+       dirty_init(w);
+       io->bio.bi_rw           = WRITE;
+       io->bio.bi_sector       = KEY_START(&w->key);
+       io->bio.bi_bdev         = io->dc->bdev;
+       io->bio.bi_end_io       = dirty_endio;
+
+       trace_bcache_write_dirty(&io->bio);
+       closure_bio_submit(&io->bio, cl, &io->dc->disk);
+
+       continue_at(cl, write_dirty_finish, dirty_wq);
+}
+
+static void read_dirty_endio(struct bio *bio, int error)
+{
+       struct keybuf_key *w = bio->bi_private;
+       struct dirty_io *io = w->private;
+
+       bch_count_io_errors(PTR_CACHE(io->dc->disk.c, &w->key, 0),
+                           error, "reading dirty data from cache");
+
+       dirty_endio(bio, error);
+}
+
+static void read_dirty_submit(struct closure *cl)
+{
+       struct dirty_io *io = container_of(cl, struct dirty_io, cl);
+
+       trace_bcache_read_dirty(&io->bio);
+       closure_bio_submit(&io->bio, cl, &io->dc->disk);
+
+       continue_at(cl, write_dirty, dirty_wq);
+}
+
+static void read_dirty(struct closure *cl)
+{
+       struct cached_dev *dc = container_of(cl, struct cached_dev,
+                                            writeback.cl);
+       unsigned delay = writeback_delay(dc, 0);
+       struct keybuf_key *w;
+       struct dirty_io *io;
+
+       /*
+        * XXX: if we error, background writeback just spins. Should use some
+        * mempools.
+        */
+
+       while (1) {
+               w = bch_keybuf_next(&dc->writeback_keys);
+               if (!w)
+                       break;
+
+               BUG_ON(ptr_stale(dc->disk.c, &w->key, 0));
+
+               if (delay > 0 &&
+                   (KEY_START(&w->key) != dc->last_read ||
+                    jiffies_to_msecs(delay) > 50)) {
+                       w->private = NULL;
+
+                       closure_delay(&dc->writeback, delay);
+                       continue_at(cl, read_dirty, dirty_wq);
+               }
+
+               dc->last_read   = KEY_OFFSET(&w->key);
+
+               io = kzalloc(sizeof(struct dirty_io) + sizeof(struct bio_vec)
+                            * DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS),
+                            GFP_KERNEL);
+               if (!io)
+                       goto err;
+
+               w->private      = io;
+               io->dc          = dc;
+
+               dirty_init(w);
+               io->bio.bi_sector       = PTR_OFFSET(&w->key, 0);
+               io->bio.bi_bdev         = PTR_CACHE(dc->disk.c,
+                                                   &w->key, 0)->bdev;
+               io->bio.bi_rw           = READ;
+               io->bio.bi_end_io       = read_dirty_endio;
+
+               if (bch_bio_alloc_pages(&io->bio, GFP_KERNEL))
+                       goto err_free;
+
+               pr_debug("%s", pkey(&w->key));
+
+               closure_call(&io->cl, read_dirty_submit, NULL, &dc->disk.cl);
+
+               delay = writeback_delay(dc, KEY_SIZE(&w->key));
+
+               atomic_inc(&dc->in_flight);
+
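+               /*
+                * Cap the number of dirty-data reads in flight at 64; if we
+                * would have to wait, punt back to the workqueue rather than
+                * blocking here.
+                */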
+               if (!closure_wait_event(&dc->writeback_wait, cl,
+                                       atomic_read(&dc->in_flight) < 64))
+                       continue_at(cl, read_dirty, dirty_wq);
+       }
+
+       if (0) {
+err_free:
+               kfree(w->private);
+err:
+               bch_keybuf_del(&dc->writeback_keys, w);
+       }
+
+       refill_dirty(cl);
+}
+
+void bch_writeback_init_cached_dev(struct cached_dev *dc)
+{
+       closure_init_unlocked(&dc->writeback);
+       init_rwsem(&dc->writeback_lock);
+
+       bch_keybuf_init(&dc->writeback_keys, dirty_pred);
+
+       dc->writeback_metadata          = true;
+       dc->writeback_running           = true;
+       dc->writeback_percent           = 10;
+       dc->writeback_delay             = 30;
+       dc->writeback_rate.rate         = 1024;
+
+       dc->writeback_rate_update_seconds = 30;
+       dc->writeback_rate_d_term       = 16;
+       dc->writeback_rate_p_term_inverse = 64;
+       dc->writeback_rate_d_smooth     = 8;
+
+       INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate);
+       schedule_delayed_work(&dc->writeback_rate_update,
+                             dc->writeback_rate_update_seconds * HZ);
+}
+
+void bch_writeback_exit(void)
+{
+       if (dirty_wq)
+               destroy_workqueue(dirty_wq);
+}
+
+int __init bch_writeback_init(void)
+{
+       dirty_wq = create_singlethread_workqueue("bcache_writeback");
+       if (!dirty_wq)
+               return -ENOMEM;
+
+       return 0;
+}
index 13c15480d9401a79ea41fb44c603a007d97e2ea1..6d2d41ae9e322dbd53e787e5294f2d55551296eb 100644 (file)
@@ -858,8 +858,7 @@ static void crypt_free_buffer_pages(struct crypt_config *cc, struct bio *clone)
        unsigned int i;
        struct bio_vec *bv;
 
-       for (i = 0; i < clone->bi_vcnt; i++) {
-               bv = bio_iovec_idx(clone, i);
+       bio_for_each_segment_all(bv, clone, i) {
                BUG_ON(!bv->bv_page);
                mempool_free(bv->bv_page, cc->page_pool);
                bv->bv_page = NULL;
index d053098c6a917f1ac55fec457ab7c7f590ecb1eb..699b5be68d319263cce75e8d932deb0be25c7d00 100644 (file)
@@ -458,7 +458,7 @@ static void map_region(struct dm_io_region *io, struct mirror *m,
 {
        io->bdev = m->dev->bdev;
        io->sector = map_sector(m, bio);
-       io->count = bio->bi_size >> 9;
+       io->count = bio_sectors(bio);
 }
 
 static void hold_bio(struct mirror_set *ms, struct bio *bio)
index d8837d313f5434183439f057f463c99be100e0ea..ea5e878a30b93b1f974d449738fc46b55d5eaeba 100644 (file)
@@ -258,7 +258,7 @@ static int stripe_map_range(struct stripe_c *sc, struct bio *bio,
        sector_t begin, end;
 
        stripe_map_range_sector(sc, bio->bi_sector, target_stripe, &begin);
-       stripe_map_range_sector(sc, bio->bi_sector + bio_sectors(bio),
+       stripe_map_range_sector(sc, bio_end_sector(bio),
                                target_stripe, &end);
        if (begin < end) {
                bio->bi_bdev = sc->stripe[target_stripe].dev->bdev;
index a746f1d21c661bec7b809c9a688fddf44284581f..b948fd864d457e9ce857d1d04fd74bf8997e0aee 100644 (file)
@@ -501,7 +501,7 @@ static int verity_map(struct dm_target *ti, struct bio *bio)
                return -EIO;
        }
 
-       if ((bio->bi_sector + bio_sectors(bio)) >>
+       if (bio_end_sector(bio) >>
            (v->data_dev_block_bits - SECTOR_SHIFT) > v->data_blocks) {
                DMERR_LIMIT("io out of range");
                return -EIO;
@@ -519,7 +519,7 @@ static int verity_map(struct dm_target *ti, struct bio *bio)
 
        bio->bi_end_io = verity_end_io;
        bio->bi_private = io;
-       io->io_vec_size = bio->bi_vcnt - bio->bi_idx;
+       io->io_vec_size = bio_segments(bio);
        if (io->io_vec_size < DM_VERITY_IO_VEC_INLINE)
                io->io_vec = io->io_vec_inline;
        else
index 9a0bdad9ad8fad49eaf8ee0d0ad916df247f0f3c..d5370a94b2c1308ece53dd50ca5f9a5011b25fc7 100644 (file)
@@ -339,7 +339,7 @@ out:
        return md ? 0 : -ENXIO;
 }
 
-static int dm_blk_close(struct gendisk *disk, fmode_t mode)
+static void dm_blk_close(struct gendisk *disk, fmode_t mode)
 {
        struct mapped_device *md = disk->private_data;
 
@@ -349,8 +349,6 @@ static int dm_blk_close(struct gendisk *disk, fmode_t mode)
        dm_put(md);
 
        spin_unlock(&_minor_lock);
-
-       return 0;
 }
 
 int dm_open_count(struct mapped_device *md)
index 5e7dc772f5deca223ea9142d073cc66785f54b27..3193aefe982b7b42badf4eba4adc36f89439d70c 100644 (file)
@@ -185,8 +185,7 @@ static void make_request(struct mddev *mddev, struct bio *bio)
                        return;
                }
 
-               if (check_sector(conf, bio->bi_sector, bio->bi_sector+(bio->bi_size>>9),
-                                WRITE))
+               if (check_sector(conf, bio->bi_sector, bio_end_sector(bio), WRITE))
                        failit = 1;
                if (check_mode(conf, WritePersistent)) {
                        add_sector(conf, bio->bi_sector, WritePersistent);
@@ -196,8 +195,7 @@ static void make_request(struct mddev *mddev, struct bio *bio)
                        failit = 1;
        } else {
                /* read request */
-               if (check_sector(conf, bio->bi_sector, bio->bi_sector + (bio->bi_size>>9),
-                                READ))
+               if (check_sector(conf, bio->bi_sector, bio_end_sector(bio), READ))
                        failit = 1;
                if (check_mode(conf, ReadTransient))
                        failit = 1;
index 21014836bdbf2286eba4fd02effbc4efcd9db625..f03fabd2b37bacf34a231a0bb034a6d8f2826e68 100644 (file)
@@ -317,8 +317,7 @@ static void linear_make_request(struct mddev *mddev, struct bio *bio)
                bio_io_error(bio);
                return;
        }
-       if (unlikely(bio->bi_sector + (bio->bi_size >> 9) >
-                    tmp_dev->end_sector)) {
+       if (unlikely(bio_end_sector(bio) > tmp_dev->end_sector)) {
                /* This bio crosses a device boundary, so we have to
                 * split it.
                 */
index 4c74424c78b049068c0f118de398640781796c1b..681d1099a2d58936864b3b63610a31f38a908219 100644 (file)
@@ -197,21 +197,12 @@ void md_trim_bio(struct bio *bio, int offset, int size)
        if (offset == 0 && size == bio->bi_size)
                return;
 
-       bio->bi_sector += offset;
-       bio->bi_size = size;
-       offset <<= 9;
        clear_bit(BIO_SEG_VALID, &bio->bi_flags);
 
-       while (bio->bi_idx < bio->bi_vcnt &&
-              bio->bi_io_vec[bio->bi_idx].bv_len <= offset) {
-               /* remove this whole bio_vec */
-               offset -= bio->bi_io_vec[bio->bi_idx].bv_len;
-               bio->bi_idx++;
-       }
-       if (bio->bi_idx < bio->bi_vcnt) {
-               bio->bi_io_vec[bio->bi_idx].bv_offset += offset;
-               bio->bi_io_vec[bio->bi_idx].bv_len -= offset;
-       }
+       bio_advance(bio, offset << 9);
+
+       bio->bi_size = size;
+
        /* avoid any complications with bi_idx being non-zero*/
        if (bio->bi_idx) {
                memmove(bio->bi_io_vec, bio->bi_io_vec+bio->bi_idx,
@@ -6674,15 +6665,13 @@ static int md_open(struct block_device *bdev, fmode_t mode)
        return err;
 }
 
-static int md_release(struct gendisk *disk, fmode_t mode)
+static void md_release(struct gendisk *disk, fmode_t mode)
 {
        struct mddev *mddev = disk->private_data;
 
        BUG_ON(!mddev);
        atomic_dec(&mddev->openers);
        mddev_put(mddev);
-
-       return 0;
 }
 
 static int md_media_changed(struct gendisk *disk)
index 0505452de8d6ee2b3533c7930d62df544636b0c3..fcf65e512cf51a02413ae4439e72ce7ab6675159 100644 (file)
@@ -502,11 +502,11 @@ static inline int is_io_in_chunk_boundary(struct mddev *mddev,
 {
        if (likely(is_power_of_2(chunk_sects))) {
                return chunk_sects >= ((bio->bi_sector & (chunk_sects-1))
-                                       + (bio->bi_size >> 9));
+                                       + bio_sectors(bio));
        } else{
                sector_t sector = bio->bi_sector;
                return chunk_sects >= (sector_div(sector, chunk_sects)
-                                               + (bio->bi_size >> 9));
+                                               + bio_sectors(bio));
        }
 }
 
@@ -527,8 +527,7 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio)
                sector_t sector = bio->bi_sector;
                struct bio_pair *bp;
                /* Sanity check -- queue functions should prevent this happening */
-               if ((bio->bi_vcnt != 1 && bio->bi_vcnt != 0) ||
-                   bio->bi_idx != 0)
+               if (bio_segments(bio) > 1)
                        goto bad_map;
                /* This is a one page bio that upper layers
                 * refuse to split for us, so we need to split it.
@@ -567,7 +566,7 @@ bad_map:
        printk("md/raid0:%s: make_request bug: can't convert block across chunks"
               " or bigger than %dk %llu %d\n",
               mdname(mddev), chunk_sects / 2,
-              (unsigned long long)bio->bi_sector, bio->bi_size >> 10);
+              (unsigned long long)bio->bi_sector, bio_sectors(bio) / 2);
 
        bio_io_error(bio);
        return;
index 851023e2ba5d5296824a46bdc12482056de648a1..55951182af73680d3b7f40d32cac1302062dbe74 100644 (file)
@@ -92,7 +92,6 @@ static void r1bio_pool_free(void *r1_bio, void *data)
 static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
 {
        struct pool_info *pi = data;
-       struct page *page;
        struct r1bio *r1_bio;
        struct bio *bio;
        int i, j;
@@ -122,14 +121,10 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
                j = 1;
        while(j--) {
                bio = r1_bio->bios[j];
-               for (i = 0; i < RESYNC_PAGES; i++) {
-                       page = alloc_page(gfp_flags);
-                       if (unlikely(!page))
-                               goto out_free_pages;
+               bio->bi_vcnt = RESYNC_PAGES;
 
-                       bio->bi_io_vec[i].bv_page = page;
-                       bio->bi_vcnt = i+1;
-               }
+               if (bio_alloc_pages(bio, gfp_flags))
+                       goto out_free_bio;
        }
        /* If not user-requests, copy the page pointers to all bios */
        if (!test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery)) {
@@ -143,11 +138,6 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
 
        return r1_bio;
 
-out_free_pages:
-       for (j=0 ; j < pi->raid_disks; j++)
-               for (i=0; i < r1_bio->bios[j]->bi_vcnt ; i++)
-                       put_page(r1_bio->bios[j]->bi_io_vec[i].bv_page);
-       j = -1;
 out_free_bio:
        while (++j < pi->raid_disks)
                bio_put(r1_bio->bios[j]);
@@ -267,7 +257,7 @@ static void raid_end_bio_io(struct r1bio *r1_bio)
                         (bio_data_dir(bio) == WRITE) ? "write" : "read",
                         (unsigned long long) bio->bi_sector,
                         (unsigned long long) bio->bi_sector +
-                        (bio->bi_size >> 9) - 1);
+                        bio_sectors(bio) - 1);
 
                call_bio_endio(r1_bio);
        }
@@ -458,7 +448,7 @@ static void raid1_end_write_request(struct bio *bio, int error)
                                         " %llu-%llu\n",
                                         (unsigned long long) mbio->bi_sector,
                                         (unsigned long long) mbio->bi_sector +
-                                        (mbio->bi_size >> 9) - 1);
+                                        bio_sectors(mbio) - 1);
                                call_bio_endio(r1_bio);
                        }
                }
@@ -925,7 +915,7 @@ static void alloc_behind_pages(struct bio *bio, struct r1bio *r1_bio)
        if (unlikely(!bvecs))
                return;
 
-       bio_for_each_segment(bvec, bio, i) {
+       bio_for_each_segment_all(bvec, bio, i) {
                bvecs[i] = *bvec;
                bvecs[i].bv_page = alloc_page(GFP_NOIO);
                if (unlikely(!bvecs[i].bv_page))
@@ -1023,7 +1013,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
        md_write_start(mddev, bio); /* wait on superblock update early */
 
        if (bio_data_dir(bio) == WRITE &&
-           bio->bi_sector + bio->bi_size/512 > mddev->suspend_lo &&
+           bio_end_sector(bio) > mddev->suspend_lo &&
            bio->bi_sector < mddev->suspend_hi) {
                /* As the suspend_* range is controlled by
                 * userspace, we want an interruptible
@@ -1034,7 +1024,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
                        flush_signals(current);
                        prepare_to_wait(&conf->wait_barrier,
                                        &w, TASK_INTERRUPTIBLE);
-                       if (bio->bi_sector + bio->bi_size/512 <= mddev->suspend_lo ||
+                       if (bio_end_sector(bio) <= mddev->suspend_lo ||
                            bio->bi_sector >= mddev->suspend_hi)
                                break;
                        schedule();
@@ -1054,7 +1044,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
        r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
 
        r1_bio->master_bio = bio;
-       r1_bio->sectors = bio->bi_size >> 9;
+       r1_bio->sectors = bio_sectors(bio);
        r1_bio->state = 0;
        r1_bio->mddev = mddev;
        r1_bio->sector = bio->bi_sector;
@@ -1132,7 +1122,7 @@ read_again:
                        r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
 
                        r1_bio->master_bio = bio;
-                       r1_bio->sectors = (bio->bi_size >> 9) - sectors_handled;
+                       r1_bio->sectors = bio_sectors(bio) - sectors_handled;
                        r1_bio->state = 0;
                        r1_bio->mddev = mddev;
                        r1_bio->sector = bio->bi_sector + sectors_handled;
@@ -1289,14 +1279,10 @@ read_again:
                        struct bio_vec *bvec;
                        int j;
 
-                       /* Yes, I really want the '__' version so that
-                        * we clear any unused pointer in the io_vec, rather
-                        * than leave them unchanged.  This is important
-                        * because when we come to free the pages, we won't
-                        * know the original bi_idx, so we just free
-                        * them all
+                       /*
+                        * We trimmed the bio, so _all is legit
                         */
-                       __bio_for_each_segment(bvec, mbio, j, 0)
+                       bio_for_each_segment_all(bvec, mbio, j)
                                bvec->bv_page = r1_bio->behind_bvecs[j].bv_page;
                        if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags))
                                atomic_inc(&r1_bio->behind_remaining);
@@ -1334,14 +1320,14 @@ read_again:
        /* Mustn't call r1_bio_write_done before this next test,
         * as it could result in the bio being freed.
         */
-       if (sectors_handled < (bio->bi_size >> 9)) {
+       if (sectors_handled < bio_sectors(bio)) {
                r1_bio_write_done(r1_bio);
                /* We need another r1_bio.  It has already been counted
                 * in bio->bi_phys_segments
                 */
                r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
                r1_bio->master_bio = bio;
-               r1_bio->sectors = (bio->bi_size >> 9) - sectors_handled;
+               r1_bio->sectors = bio_sectors(bio) - sectors_handled;
                r1_bio->state = 0;
                r1_bio->mddev = mddev;
                r1_bio->sector = bio->bi_sector + sectors_handled;
@@ -1867,7 +1853,7 @@ static int process_checks(struct r1bio *r1_bio)
                struct bio *sbio = r1_bio->bios[i];
                int size;
 
-               if (r1_bio->bios[i]->bi_end_io != end_sync_read)
+               if (sbio->bi_end_io != end_sync_read)
                        continue;
 
                if (test_bit(BIO_UPTODATE, &sbio->bi_flags)) {
@@ -1892,16 +1878,15 @@ static int process_checks(struct r1bio *r1_bio)
                        continue;
                }
                /* fixup the bio for reuse */
+               bio_reset(sbio);
                sbio->bi_vcnt = vcnt;
                sbio->bi_size = r1_bio->sectors << 9;
-               sbio->bi_idx = 0;
-               sbio->bi_phys_segments = 0;
-               sbio->bi_flags &= ~(BIO_POOL_MASK - 1);
-               sbio->bi_flags |= 1 << BIO_UPTODATE;
-               sbio->bi_next = NULL;
                sbio->bi_sector = r1_bio->sector +
                        conf->mirrors[i].rdev->data_offset;
                sbio->bi_bdev = conf->mirrors[i].rdev->bdev;
+               sbio->bi_end_io = end_sync_read;
+               sbio->bi_private = r1_bio;
+
                size = sbio->bi_size;
                for (j = 0; j < vcnt ; j++) {
                        struct bio_vec *bi;
@@ -1912,10 +1897,9 @@ static int process_checks(struct r1bio *r1_bio)
                        else
                                bi->bv_len = size;
                        size -= PAGE_SIZE;
-                       memcpy(page_address(bi->bv_page),
-                              page_address(pbio->bi_io_vec[j].bv_page),
-                              PAGE_SIZE);
                }
+
+               bio_copy_data(sbio, pbio);
        }
        return 0;
 }
@@ -1952,7 +1936,7 @@ static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio)
                wbio->bi_rw = WRITE;
                wbio->bi_end_io = end_sync_write;
                atomic_inc(&r1_bio->remaining);
-               md_sync_acct(conf->mirrors[i].rdev->bdev, wbio->bi_size >> 9);
+               md_sync_acct(conf->mirrors[i].rdev->bdev, bio_sectors(wbio));
 
                generic_make_request(wbio);
        }
@@ -2064,32 +2048,11 @@ static void fix_read_error(struct r1conf *conf, int read_disk,
        }
 }
 
-static void bi_complete(struct bio *bio, int error)
-{
-       complete((struct completion *)bio->bi_private);
-}
-
-static int submit_bio_wait(int rw, struct bio *bio)
-{
-       struct completion event;
-       rw |= REQ_SYNC;
-
-       init_completion(&event);
-       bio->bi_private = &event;
-       bio->bi_end_io = bi_complete;
-       submit_bio(rw, bio);
-       wait_for_completion(&event);
-
-       return test_bit(BIO_UPTODATE, &bio->bi_flags);
-}
-
 static int narrow_write_error(struct r1bio *r1_bio, int i)
 {
        struct mddev *mddev = r1_bio->mddev;
        struct r1conf *conf = mddev->private;
        struct md_rdev *rdev = conf->mirrors[i].rdev;
-       int vcnt, idx;
-       struct bio_vec *vec;
 
        /* bio has the data to be written to device 'i' where
         * we just recently had a write error.
@@ -2117,30 +2080,32 @@ static int narrow_write_error(struct r1bio *r1_bio, int i)
                   & ~(sector_t)(block_sectors - 1))
                - sector;
 
-       if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
-               vcnt = r1_bio->behind_page_count;
-               vec = r1_bio->behind_bvecs;
-               idx = 0;
-               while (vec[idx].bv_page == NULL)
-                       idx++;
-       } else {
-               vcnt = r1_bio->master_bio->bi_vcnt;
-               vec = r1_bio->master_bio->bi_io_vec;
-               idx = r1_bio->master_bio->bi_idx;
-       }
        while (sect_to_write) {
                struct bio *wbio;
                if (sectors > sect_to_write)
                        sectors = sect_to_write;
                /* Write at 'sector' for 'sectors'*/
 
-               wbio = bio_alloc_mddev(GFP_NOIO, vcnt, mddev);
-               memcpy(wbio->bi_io_vec, vec, vcnt * sizeof(struct bio_vec));
-               wbio->bi_sector = r1_bio->sector;
+               if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
+                       unsigned vcnt = r1_bio->behind_page_count;
+                       struct bio_vec *vec = r1_bio->behind_bvecs;
+
+                       while (!vec->bv_page) {
+                               vec++;
+                               vcnt--;
+                       }
+
+                       wbio = bio_alloc_mddev(GFP_NOIO, vcnt, mddev);
+                       memcpy(wbio->bi_io_vec, vec, vcnt * sizeof(struct bio_vec));
+
+                       wbio->bi_vcnt = vcnt;
+               } else {
+                       wbio = bio_clone_mddev(r1_bio->master_bio, GFP_NOIO, mddev);
+               }
+
                wbio->bi_rw = WRITE;
-               wbio->bi_vcnt = vcnt;
+               wbio->bi_sector = r1_bio->sector;
                wbio->bi_size = r1_bio->sectors << 9;
-               wbio->bi_idx = idx;
 
                md_trim_bio(wbio, sector - r1_bio->sector, sectors);
                wbio->bi_sector += rdev->data_offset;
@@ -2289,8 +2254,7 @@ read_more:
                        r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
 
                        r1_bio->master_bio = mbio;
-                       r1_bio->sectors = (mbio->bi_size >> 9)
-                                         - sectors_handled;
+                       r1_bio->sectors = bio_sectors(mbio) - sectors_handled;
                        r1_bio->state = 0;
                        set_bit(R1BIO_ReadError, &r1_bio->state);
                        r1_bio->mddev = mddev;
@@ -2464,18 +2428,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
        for (i = 0; i < conf->raid_disks * 2; i++) {
                struct md_rdev *rdev;
                bio = r1_bio->bios[i];
-
-               /* take from bio_init */
-               bio->bi_next = NULL;
-               bio->bi_flags &= ~(BIO_POOL_MASK-1);
-               bio->bi_flags |= 1 << BIO_UPTODATE;
-               bio->bi_rw = READ;
-               bio->bi_vcnt = 0;
-               bio->bi_idx = 0;
-               bio->bi_phys_segments = 0;
-               bio->bi_size = 0;
-               bio->bi_end_io = NULL;
-               bio->bi_private = NULL;
+               bio_reset(bio);
 
                rdev = rcu_dereference(conf->mirrors[i].rdev);
                if (rdev == NULL ||
index 018741ba93104d9ad432d7524a136cf7e69b2227..59d4daa5f4c7a32c245ef954f24650fe75084117 100644 (file)
@@ -1174,14 +1174,13 @@ static void make_request(struct mddev *mddev, struct bio * bio)
        /* If this request crosses a chunk boundary, we need to
         * split it.  This will only happen for 1 PAGE (or less) requests.
         */
-       if (unlikely((bio->bi_sector & chunk_mask) + (bio->bi_size >> 9)
+       if (unlikely((bio->bi_sector & chunk_mask) + bio_sectors(bio)
                     > chunk_sects
                     && (conf->geo.near_copies < conf->geo.raid_disks
                         || conf->prev.near_copies < conf->prev.raid_disks))) {
                struct bio_pair *bp;
                /* Sanity check -- queue functions should prevent this happening */
-               if ((bio->bi_vcnt != 1 && bio->bi_vcnt != 0) ||
-                   bio->bi_idx != 0)
+               if (bio_segments(bio) > 1)
                        goto bad_map;
                /* This is a one page bio that upper layers
                 * refuse to split for us, so we need to split it.
@@ -1214,7 +1213,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
        bad_map:
                printk("md/raid10:%s: make_request bug: can't convert block across chunks"
                       " or bigger than %dk %llu %d\n", mdname(mddev), chunk_sects/2,
-                      (unsigned long long)bio->bi_sector, bio->bi_size >> 10);
+                      (unsigned long long)bio->bi_sector, bio_sectors(bio) / 2);
 
                bio_io_error(bio);
                return;
@@ -1229,7 +1228,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
         */
        wait_barrier(conf);
 
-       sectors = bio->bi_size >> 9;
+       sectors = bio_sectors(bio);
        while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
            bio->bi_sector < conf->reshape_progress &&
            bio->bi_sector + sectors > conf->reshape_progress) {
@@ -1331,8 +1330,7 @@ read_again:
                        r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
 
                        r10_bio->master_bio = bio;
-                       r10_bio->sectors = ((bio->bi_size >> 9)
-                                           - sectors_handled);
+                       r10_bio->sectors = bio_sectors(bio) - sectors_handled;
                        r10_bio->state = 0;
                        r10_bio->mddev = mddev;
                        r10_bio->sector = bio->bi_sector + sectors_handled;
@@ -1574,7 +1572,7 @@ retry_write:
         * after checking if we need to go around again.
         */
 
-       if (sectors_handled < (bio->bi_size >> 9)) {
+       if (sectors_handled < bio_sectors(bio)) {
                one_write_done(r10_bio);
                /* We need another r10_bio.  It has already been counted
                 * in bio->bi_phys_segments.
@@ -1582,7 +1580,7 @@ retry_write:
                r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
 
                r10_bio->master_bio = bio;
-               r10_bio->sectors = (bio->bi_size >> 9) - sectors_handled;
+               r10_bio->sectors = bio_sectors(bio) - sectors_handled;
 
                r10_bio->mddev = mddev;
                r10_bio->sector = bio->bi_sector + sectors_handled;
@@ -2084,13 +2082,10 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
                 * First we need to fixup bv_offset, bv_len and
                 * bi_vecs, as the read request might have corrupted these
                 */
+               bio_reset(tbio);
+
                tbio->bi_vcnt = vcnt;
                tbio->bi_size = r10_bio->sectors << 9;
-               tbio->bi_idx = 0;
-               tbio->bi_phys_segments = 0;
-               tbio->bi_flags &= ~(BIO_POOL_MASK - 1);
-               tbio->bi_flags |= 1 << BIO_UPTODATE;
-               tbio->bi_next = NULL;
                tbio->bi_rw = WRITE;
                tbio->bi_private = r10_bio;
                tbio->bi_sector = r10_bio->devs[i].addr;
@@ -2108,7 +2103,7 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
                d = r10_bio->devs[i].devnum;
                atomic_inc(&conf->mirrors[d].rdev->nr_pending);
                atomic_inc(&r10_bio->remaining);
-               md_sync_acct(conf->mirrors[d].rdev->bdev, tbio->bi_size >> 9);
+               md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(tbio));
 
                tbio->bi_sector += conf->mirrors[d].rdev->data_offset;
                tbio->bi_bdev = conf->mirrors[d].rdev->bdev;
@@ -2133,7 +2128,7 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
                d = r10_bio->devs[i].devnum;
                atomic_inc(&r10_bio->remaining);
                md_sync_acct(conf->mirrors[d].replacement->bdev,
-                            tbio->bi_size >> 9);
+                            bio_sectors(tbio));
                generic_make_request(tbio);
        }
 
@@ -2259,13 +2254,13 @@ static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio)
        wbio2 = r10_bio->devs[1].repl_bio;
        if (wbio->bi_end_io) {
                atomic_inc(&conf->mirrors[d].rdev->nr_pending);
-               md_sync_acct(conf->mirrors[d].rdev->bdev, wbio->bi_size >> 9);
+               md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(wbio));
                generic_make_request(wbio);
        }
        if (wbio2 && wbio2->bi_end_io) {
                atomic_inc(&conf->mirrors[d].replacement->nr_pending);
                md_sync_acct(conf->mirrors[d].replacement->bdev,
-                            wbio2->bi_size >> 9);
+                            bio_sectors(wbio2));
                generic_make_request(wbio2);
        }
 }
@@ -2536,25 +2531,6 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
        }
 }
 
-static void bi_complete(struct bio *bio, int error)
-{
-       complete((struct completion *)bio->bi_private);
-}
-
-static int submit_bio_wait(int rw, struct bio *bio)
-{
-       struct completion event;
-       rw |= REQ_SYNC;
-
-       init_completion(&event);
-       bio->bi_private = &event;
-       bio->bi_end_io = bi_complete;
-       submit_bio(rw, bio);
-       wait_for_completion(&event);
-
-       return test_bit(BIO_UPTODATE, &bio->bi_flags);
-}
-
 static int narrow_write_error(struct r10bio *r10_bio, int i)
 {
        struct bio *bio = r10_bio->master_bio;
@@ -2695,8 +2671,7 @@ read_more:
                r10_bio = mempool_alloc(conf->r10bio_pool,
                                        GFP_NOIO);
                r10_bio->master_bio = mbio;
-               r10_bio->sectors = (mbio->bi_size >> 9)
-                       - sectors_handled;
+               r10_bio->sectors = bio_sectors(mbio) - sectors_handled;
                r10_bio->state = 0;
                set_bit(R10BIO_ReadError,
                        &r10_bio->state);
@@ -3133,6 +3108,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
                                        }
                                }
                                bio = r10_bio->devs[0].bio;
+                               bio_reset(bio);
                                bio->bi_next = biolist;
                                biolist = bio;
                                bio->bi_private = r10_bio;
@@ -3157,6 +3133,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
                                rdev = mirror->rdev;
                                if (!test_bit(In_sync, &rdev->flags)) {
                                        bio = r10_bio->devs[1].bio;
+                                       bio_reset(bio);
                                        bio->bi_next = biolist;
                                        biolist = bio;
                                        bio->bi_private = r10_bio;
@@ -3185,6 +3162,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
                                if (rdev == NULL || bio == NULL ||
                                    test_bit(Faulty, &rdev->flags))
                                        break;
+                               bio_reset(bio);
                                bio->bi_next = biolist;
                                biolist = bio;
                                bio->bi_private = r10_bio;
@@ -3283,7 +3261,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
                                r10_bio->devs[i].repl_bio->bi_end_io = NULL;
 
                        bio = r10_bio->devs[i].bio;
-                       bio->bi_end_io = NULL;
+                       bio_reset(bio);
                        clear_bit(BIO_UPTODATE, &bio->bi_flags);
                        if (conf->mirrors[d].rdev == NULL ||
                            test_bit(Faulty, &conf->mirrors[d].rdev->flags))
@@ -3320,6 +3298,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
 
                        /* Need to set up for writing to the replacement */
                        bio = r10_bio->devs[i].repl_bio;
+                       bio_reset(bio);
                        clear_bit(BIO_UPTODATE, &bio->bi_flags);
 
                        sector = r10_bio->devs[i].addr;
@@ -3353,17 +3332,6 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
                }
        }
 
-       for (bio = biolist; bio ; bio=bio->bi_next) {
-
-               bio->bi_flags &= ~(BIO_POOL_MASK - 1);
-               if (bio->bi_end_io)
-                       bio->bi_flags |= 1 << BIO_UPTODATE;
-               bio->bi_vcnt = 0;
-               bio->bi_idx = 0;
-               bio->bi_phys_segments = 0;
-               bio->bi_size = 0;
-       }
-
        nr_sectors = 0;
        if (sector_nr + max_sync < max_sector)
                max_sector = sector_nr + max_sync;
@@ -4411,7 +4379,6 @@ read_more:
        read_bio->bi_flags &= ~(BIO_POOL_MASK - 1);
        read_bio->bi_flags |= 1 << BIO_UPTODATE;
        read_bio->bi_vcnt = 0;
-       read_bio->bi_idx = 0;
        read_bio->bi_size = 0;
        r10_bio->master_bio = read_bio;
        r10_bio->read_slot = r10_bio->devs[r10_bio->read_slot].devnum;
@@ -4435,17 +4402,14 @@ read_more:
                }
                if (!rdev2 || test_bit(Faulty, &rdev2->flags))
                        continue;
+
+               bio_reset(b);
                b->bi_bdev = rdev2->bdev;
                b->bi_sector = r10_bio->devs[s/2].addr + rdev2->new_data_offset;
                b->bi_private = r10_bio;
                b->bi_end_io = end_reshape_write;
                b->bi_rw = WRITE;
-               b->bi_flags &= ~(BIO_POOL_MASK - 1);
-               b->bi_flags |= 1 << BIO_UPTODATE;
                b->bi_next = blist;
-               b->bi_vcnt = 0;
-               b->bi_idx = 0;
-               b->bi_size = 0;
                blist = b;
        }
 
index 4a7be455d6d86ceb6bda86a332b81d036db52dee..9359828ffe264d3313ee77de993ea4c5147f1205 100644 (file)
@@ -90,7 +90,7 @@ static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect)
  */
 static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector)
 {
-       int sectors = bio->bi_size >> 9;
+       int sectors = bio_sectors(bio);
        if (bio->bi_sector + sectors < sector + STRIPE_SECTORS)
                return bio->bi_next;
        else
@@ -569,14 +569,6 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
                bi = &sh->dev[i].req;
                rbi = &sh->dev[i].rreq; /* For writing to replacement */
 
-               bi->bi_rw = rw;
-               rbi->bi_rw = rw;
-               if (rw & WRITE) {
-                       bi->bi_end_io = raid5_end_write_request;
-                       rbi->bi_end_io = raid5_end_write_request;
-               } else
-                       bi->bi_end_io = raid5_end_read_request;
-
                rcu_read_lock();
                rrdev = rcu_dereference(conf->disks[i].replacement);
                smp_mb(); /* Ensure that if rrdev is NULL, rdev won't be */
@@ -651,7 +643,14 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
 
                        set_bit(STRIPE_IO_STARTED, &sh->state);
 
+                       bio_reset(bi);
                        bi->bi_bdev = rdev->bdev;
+                       bi->bi_rw = rw;
+                       bi->bi_end_io = (rw & WRITE)
+                               ? raid5_end_write_request
+                               : raid5_end_read_request;
+                       bi->bi_private = sh;
+
                        pr_debug("%s: for %llu schedule op %ld on disc %d\n",
                                __func__, (unsigned long long)sh->sector,
                                bi->bi_rw, i);
@@ -665,12 +664,9 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
                        if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
                                bi->bi_rw |= REQ_FLUSH;
 
-                       bi->bi_flags = 1 << BIO_UPTODATE;
-                       bi->bi_idx = 0;
                        bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
                        bi->bi_io_vec[0].bv_offset = 0;
                        bi->bi_size = STRIPE_SIZE;
-                       bi->bi_next = NULL;
                        if (rrdev)
                                set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags);
 
@@ -687,7 +683,13 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
 
                        set_bit(STRIPE_IO_STARTED, &sh->state);
 
+                       bio_reset(rbi);
                        rbi->bi_bdev = rrdev->bdev;
+                       rbi->bi_rw = rw;
+                       BUG_ON(!(rw & WRITE));
+                       rbi->bi_end_io = raid5_end_write_request;
+                       rbi->bi_private = sh;
+
                        pr_debug("%s: for %llu schedule op %ld on "
                                 "replacement disc %d\n",
                                __func__, (unsigned long long)sh->sector,
@@ -699,12 +701,9 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
                        else
                                rbi->bi_sector = (sh->sector
                                                  + rrdev->data_offset);
-                       rbi->bi_flags = 1 << BIO_UPTODATE;
-                       rbi->bi_idx = 0;
                        rbi->bi_io_vec[0].bv_len = STRIPE_SIZE;
                        rbi->bi_io_vec[0].bv_offset = 0;
                        rbi->bi_size = STRIPE_SIZE;
-                       rbi->bi_next = NULL;
                        if (conf->mddev->gendisk)
                                trace_block_bio_remap(bdev_get_queue(rbi->bi_bdev),
                                                      rbi, disk_devt(conf->mddev->gendisk),
@@ -2402,11 +2401,11 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
        } else
                bip = &sh->dev[dd_idx].toread;
        while (*bip && (*bip)->bi_sector < bi->bi_sector) {
-               if ((*bip)->bi_sector + ((*bip)->bi_size >> 9) > bi->bi_sector)
+               if (bio_end_sector(*bip) > bi->bi_sector)
                        goto overlap;
                bip = & (*bip)->bi_next;
        }
-       if (*bip && (*bip)->bi_sector < bi->bi_sector + ((bi->bi_size)>>9))
+       if (*bip && (*bip)->bi_sector < bio_end_sector(bi))
                goto overlap;
 
        BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next);
@@ -2422,8 +2421,8 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
                     sector < sh->dev[dd_idx].sector + STRIPE_SECTORS &&
                             bi && bi->bi_sector <= sector;
                     bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) {
-                       if (bi->bi_sector + (bi->bi_size>>9) >= sector)
-                               sector = bi->bi_sector + (bi->bi_size>>9);
+                       if (bio_end_sector(bi) >= sector)
+                               sector = bio_end_sector(bi);
                }
                if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS)
                        set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags);
@@ -3849,7 +3848,7 @@ static int in_chunk_boundary(struct mddev *mddev, struct bio *bio)
 {
        sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev);
        unsigned int chunk_sectors = mddev->chunk_sectors;
-       unsigned int bio_sectors = bio->bi_size >> 9;
+       unsigned int bio_sectors = bio_sectors(bio);
 
        if (mddev->new_chunk_sectors < mddev->chunk_sectors)
                chunk_sectors = mddev->new_chunk_sectors;
@@ -3941,7 +3940,7 @@ static int bio_fits_rdev(struct bio *bi)
 {
        struct request_queue *q = bdev_get_queue(bi->bi_bdev);
 
-       if ((bi->bi_size>>9) > queue_max_sectors(q))
+       if (bio_sectors(bi) > queue_max_sectors(q))
                return 0;
        blk_recount_segments(q, bi);
        if (bi->bi_phys_segments > queue_max_segments(q))
@@ -3988,7 +3987,7 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)
                                                    0,
                                                    &dd_idx, NULL);
 
-       end_sector = align_bi->bi_sector + (align_bi->bi_size >> 9);
+       end_sector = bio_end_sector(align_bi);
        rcu_read_lock();
        rdev = rcu_dereference(conf->disks[dd_idx].replacement);
        if (!rdev || test_bit(Faulty, &rdev->flags) ||
@@ -4011,7 +4010,7 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)
                align_bi->bi_flags &= ~(1 << BIO_SEG_VALID);
 
                if (!bio_fits_rdev(align_bi) ||
-                   is_badblock(rdev, align_bi->bi_sector, align_bi->bi_size>>9,
+                   is_badblock(rdev, align_bi->bi_sector, bio_sectors(align_bi),
                                &first_bad, &bad_sectors)) {
                        /* too big in some way, or has a known bad block */
                        bio_put(align_bi);
@@ -4273,7 +4272,7 @@ static void make_request(struct mddev *mddev, struct bio * bi)
        }
 
        logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
-       last_sector = bi->bi_sector + (bi->bi_size>>9);
+       last_sector = bio_end_sector(bi);
        bi->bi_next = NULL;
        bi->bi_phys_segments = 1;       /* over-loaded to count active stripes */
 
@@ -4739,7 +4738,7 @@ static int  retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
        logical_sector = raid_bio->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
        sector = raid5_compute_sector(conf, logical_sector,
                                      0, &dd_idx, NULL);
-       last_sector = raid_bio->bi_sector + (raid_bio->bi_size>>9);
+       last_sector = bio_end_sector(raid_bio);
 
        for (; logical_sector < last_sector;
             logical_sector += STRIPE_SECTORS,
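
The raid5 conversions above replace open-coded `bi_size >> 9` and `bi_sector + (bi_size >> 9)` arithmetic with the generic accessors. As a rough guide (a sketch, not a verbatim quote of include/linux/bio.h), the 3.10-era helpers expand to the same arithmetic:

/* Approximate 3.10-era helpers from include/linux/bio.h (sketch) */
#define bio_sectors(bio)	((bio)->bi_size >> 9)
#define bio_end_sector(bio)	((bio)->bi_sector + bio_sectors(bio))

Going through the accessors keeps the call sites correct if the underlying bio fields change, which is what later happened with the immutable-biovec rework.
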
index f12b78dbce0426ba28fb156f1cf48c412fa75196..f4176ca3a794ee2c684d35e3461da165b1abc7ce 100644 (file)
@@ -204,7 +204,7 @@ static int mspro_block_bd_open(struct block_device *bdev, fmode_t mode)
 }
 
 
-static int mspro_block_disk_release(struct gendisk *disk)
+static void mspro_block_disk_release(struct gendisk *disk)
 {
        struct mspro_block_data *msb = disk->private_data;
        int disk_id = MINOR(disk_devt(disk)) >> MSPRO_BLOCK_PART_SHIFT;
@@ -224,13 +224,11 @@ static int mspro_block_disk_release(struct gendisk *disk)
        }
 
        mutex_unlock(&mspro_block_disk_lock);
-
-       return 0;
 }
 
-static int mspro_block_bd_release(struct gendisk *disk, fmode_t mode)
+static void mspro_block_bd_release(struct gendisk *disk, fmode_t mode)
 {
-       return mspro_block_disk_release(disk);
+       mspro_block_disk_release(disk);
 }
 
 static int mspro_block_bd_getgeo(struct block_device *bdev,
index ffee6f781e30f6a25537bba1ed44f379aee137fd..dd239bdbfcb4a0877db2ab49aa3c27a81eec7dd1 100644 (file)
@@ -2235,10 +2235,10 @@ static int mptsas_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy,
        }
 
        /* do we need to support multiple segments? */
-       if (req->bio->bi_vcnt > 1 || rsp->bio->bi_vcnt > 1) {
+       if (bio_segments(req->bio) > 1 || bio_segments(rsp->bio) > 1) {
                printk(MYIOC_s_ERR_FMT "%s: multiple segments req %u %u, rsp %u %u\n",
-                   ioc->name, __func__, req->bio->bi_vcnt, blk_rq_bytes(req),
-                   rsp->bio->bi_vcnt, blk_rq_bytes(rsp));
+                   ioc->name, __func__, bio_segments(req->bio), blk_rq_bytes(req),
+                   bio_segments(rsp->bio), blk_rq_bytes(rsp));
                return -EINVAL;
        }
 
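
mptsas now asks bio_segments() rather than reading bi_vcnt directly; for a freshly built bio the two agree, but bio_segments() also accounts for segments that have already been consumed (bi_idx). A sketch of the pre-3.14 definition, for orientation only:

/* Approximate pre-3.14 definition (sketch): segments still to process */
#define bio_segments(bio)	((bio)->bi_vcnt - (bio)->bi_idx)
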
index 49e86aed2bc4f13d05ffbf5a2d094b5a979372cc..6fc3866965df9713d80c1c0bbc917a2c5fd603a7 100644 (file)
@@ -600,10 +600,8 @@ static int i2o_block_open(struct block_device *bdev, fmode_t mode)
  *
  *     Unlock and unmount the media, and power down the device. Gets called if
  *     the block device is closed.
- *
- *     Returns 0 on success or negative error code on failure.
  */
-static int i2o_block_release(struct gendisk *disk, fmode_t mode)
+static void i2o_block_release(struct gendisk *disk, fmode_t mode)
 {
        struct i2o_block_device *dev = disk->private_data;
        u8 operation;
@@ -617,7 +615,7 @@ static int i2o_block_release(struct gendisk *disk, fmode_t mode)
         * the TID no longer exists.
         */
        if (!dev->i2o_dev)
-               return 0;
+               return;
 
        mutex_lock(&i2o_block_mutex);
        i2o_block_device_flush(dev->i2o_dev);
@@ -631,8 +629,6 @@ static int i2o_block_release(struct gendisk *disk, fmode_t mode)
 
        i2o_block_device_power(dev, operation);
        mutex_unlock(&i2o_block_mutex);
-
-       return 0;
 }
 
 static int i2o_block_getgeo(struct block_device *bdev, struct hd_geometry *geo)
index e12a03cc2a6e924e6ef7cb2597a91c0383f8afdb..dd27b0783d5213aa5d7d2aaab8d19258ad8fa0a7 100644 (file)
@@ -304,14 +304,13 @@ static int mmc_blk_open(struct block_device *bdev, fmode_t mode)
        return ret;
 }
 
-static int mmc_blk_release(struct gendisk *disk, fmode_t mode)
+static void mmc_blk_release(struct gendisk *disk, fmode_t mode)
 {
        struct mmc_blk_data *md = disk->private_data;
 
        mutex_lock(&block_mutex);
        mmc_blk_put(md);
        mutex_unlock(&block_mutex);
-       return 0;
 }
 
 static int
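
The mspro_block, i2o_block and mmc hunks above are part of the tree-wide switch of block_device_operations ->release() from int to void: the block layer ignores the return value, so drivers now simply clean up and return. A minimal sketch of a driver using the new prototype; my_blk_data and my_blk_put() are hypothetical stand-ins for a driver's own refcounting:

/*
 * Sketch only: ->release() with the 3.10 void prototype.
 */
#include <linux/module.h>
#include <linux/blkdev.h>
#include <linux/genhd.h>

struct my_blk_data;			/* hypothetical per-disk state */
void my_blk_put(struct my_blk_data *md);/* hypothetical refcount drop */

static void my_blk_release(struct gendisk *disk, fmode_t mode)
{
	struct my_blk_data *md = disk->private_data;

	my_blk_put(md);			/* drop the reference taken in open() */
}

static const struct block_device_operations my_blk_ops = {
	.owner   = THIS_MODULE,
	.release = my_blk_release,
	/* .open, .getgeo, .ioctl, ... omitted */
};
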
index 557bec599f4f6741255a3f6c16f6aff0c6d4585b..5fab4e6e83013c033c01d2465b4f25c756f39dce 100644 (file)
@@ -157,19 +157,6 @@ config MTD_BCM47XX_PARTS
 
 comment "User Modules And Translation Layers"
 
-config MTD_CHAR
-       tristate "Direct char device access to MTD devices"
-       help
-         This provides a character device for each MTD device present in
-         the system, allowing the user to read and write directly to the
-         memory chips, and also use ioctl() to obtain information about
-         the device, or to erase parts of it.
-
-config HAVE_MTD_OTP
-       bool
-       help
-         Enable access to OTP regions using MTD_CHAR.
-
 config MTD_BLKDEVS
        tristate "Common interface to block layer for MTD 'translation layers'"
        depends on BLOCK
index 18a38e55b2f0fec9cc3eb7466cf51568862e7d70..4cfb31e6c966ad37d1384d59ccd12d572961ea93 100644 (file)
@@ -4,7 +4,7 @@
 
 # Core functionality.
 obj-$(CONFIG_MTD)              += mtd.o
-mtd-y                          := mtdcore.o mtdsuper.o mtdconcat.o mtdpart.o
+mtd-y                          := mtdcore.o mtdsuper.o mtdconcat.o mtdpart.o mtdchar.o
 
 obj-$(CONFIG_MTD_OF_PARTS)     += ofpart.o
 obj-$(CONFIG_MTD_REDBOOT_PARTS) += redboot.o
@@ -15,7 +15,6 @@ obj-$(CONFIG_MTD_BCM63XX_PARTS)       += bcm63xxpart.o
 obj-$(CONFIG_MTD_BCM47XX_PARTS)        += bcm47xxpart.o
 
 # 'Users' - code which presents functionality to userspace.
-obj-$(CONFIG_MTD_CHAR)         += mtdchar.o
 obj-$(CONFIG_MTD_BLKDEVS)      += mtd_blkdevs.o
 obj-$(CONFIG_MTD_BLOCK)                += mtdblock.o
 obj-$(CONFIG_MTD_BLOCK_RO)     += mtdblock_ro.o
index c219e3d098d9d1572584fed1613aa52f627e9014..e4696b37f3deca35ccca51be33c8ff3181606c58 100644 (file)
@@ -146,7 +146,6 @@ config MTD_CFI_I8
 config MTD_OTP
        bool "Protection Registers aka one-time programmable (OTP) bits"
        depends on MTD_CFI_ADV_OPTIONS
-       select HAVE_MTD_OTP
        default n
        help
          This enables support for reading, writing and locking so called
index 12311f506ca15385c8e325cfb511ccff28b3351a..2a4d55e4b3628b7437fb82d7d51a929c92cf79e2 100644 (file)
@@ -71,7 +71,6 @@ config MTD_DATAFLASH_WRITE_VERIFY
 config MTD_DATAFLASH_OTP
        bool "DataFlash OTP support (Security Register)"
        depends on MTD_DATAFLASH
-       select HAVE_MTD_OTP
        help
          Newer DataFlash chips (revisions C and D) support 128 bytes of
          one-time-programmable (OTP) data.  The first half may be written
@@ -205,69 +204,6 @@ config MTD_BLOCK2MTD
 
 comment "Disk-On-Chip Device Drivers"
 
-config MTD_DOC2000
-       tristate "M-Systems Disk-On-Chip 2000 and Millennium (DEPRECATED)"
-       depends on MTD_NAND
-       select MTD_DOCPROBE
-       select MTD_NAND_IDS
-       ---help---
-         This provides an MTD device driver for the M-Systems DiskOnChip
-         2000 and Millennium devices.  Originally designed for the DiskOnChip
-         2000, it also now includes support for the DiskOnChip Millennium.
-         If you have problems with this driver and the DiskOnChip Millennium,
-         you may wish to try the alternative Millennium driver below. To use
-         the alternative driver, you will need to undefine DOC_SINGLE_DRIVER
-         in the <file:drivers/mtd/devices/docprobe.c> source code.
-
-         If you use this device, you probably also want to enable the NFTL
-         'NAND Flash Translation Layer' option below, which is used to
-         emulate a block device by using a kind of file system on the flash
-         chips.
-
-         NOTE: This driver is deprecated and will probably be removed soon.
-         Please try the new DiskOnChip driver under "NAND Flash Device
-         Drivers".
-
-config MTD_DOC2001
-       tristate "M-Systems Disk-On-Chip Millennium-only alternative driver (DEPRECATED)"
-       depends on MTD_NAND
-       select MTD_DOCPROBE
-       select MTD_NAND_IDS
-       ---help---
-         This provides an alternative MTD device driver for the M-Systems
-         DiskOnChip Millennium devices.  Use this if you have problems with
-         the combined DiskOnChip 2000 and Millennium driver above.  To get
-         the DiskOnChip probe code to load and use this driver instead of
-         the other one, you will need to undefine DOC_SINGLE_DRIVER near
-         the beginning of <file:drivers/mtd/devices/docprobe.c>.
-
-         If you use this device, you probably also want to enable the NFTL
-         'NAND Flash Translation Layer' option below, which is used to
-         emulate a block device by using a kind of file system on the flash
-         chips.
-
-         NOTE: This driver is deprecated and will probably be removed soon.
-         Please try the new DiskOnChip driver under "NAND Flash Device
-         Drivers".
-
-config MTD_DOC2001PLUS
-       tristate "M-Systems Disk-On-Chip Millennium Plus"
-       depends on MTD_NAND
-       select MTD_DOCPROBE
-       select MTD_NAND_IDS
-       ---help---
-         This provides an MTD device driver for the M-Systems DiskOnChip
-         Millennium Plus devices.
-
-         If you use this device, you probably also want to enable the INFTL
-         'Inverse NAND Flash Translation Layer' option below, which is used
-         to emulate a block device by using a kind of file system on the
-         flash chips.
-
-         NOTE: This driver will soon be replaced by the new DiskOnChip driver
-         under "NAND Flash Device Drivers" (currently that driver does not
-         support all Millennium Plus devices).
-
 config MTD_DOCG3
        tristate "M-Systems Disk-On-Chip G3"
        select BCH
index 369a1943ca259e6106caafeb43f04a4ca1f862fb..d83bd73096f67d916525371a5f51a3dec933877f 100644 (file)
@@ -2,12 +2,7 @@
 # linux/drivers/mtd/devices/Makefile
 #
 
-obj-$(CONFIG_MTD_DOC2000)      += doc2000.o
-obj-$(CONFIG_MTD_DOC2001)      += doc2001.o
-obj-$(CONFIG_MTD_DOC2001PLUS)  += doc2001plus.o
 obj-$(CONFIG_MTD_DOCG3)                += docg3.o
-obj-$(CONFIG_MTD_DOCPROBE)     += docprobe.o
-obj-$(CONFIG_MTD_DOCECC)       += docecc.o
 obj-$(CONFIG_MTD_SLRAM)                += slram.o
 obj-$(CONFIG_MTD_PHRAM)                += phram.o
 obj-$(CONFIG_MTD_PMC551)       += pmc551.o
index 95266285acb16554fdfef090812cc6840b940d1a..18e7761137a33037a21aa61585e20d98f47b172e 100644 (file)
@@ -10,7 +10,7 @@
 MODULE_LICENSE("GPL");
 MODULE_DESCRIPTION("Serial flash driver for BCMA bus");
 
-static const char *probes[] = { "bcm47xxpart", NULL };
+static const char * const probes[] = { "bcm47xxpart", NULL };
 
 static int bcm47xxsflash_read(struct mtd_info *mtd, loff_t from, size_t len,
                              size_t *retlen, u_char *buf)
@@ -61,6 +61,17 @@ static int bcm47xxsflash_bcma_probe(struct platform_device *pdev)
        }
        sflash->priv = b47s;
 
+       b47s->bcma_cc = container_of(sflash, struct bcma_drv_cc, sflash);
+
+       switch (b47s->bcma_cc->capabilities & BCMA_CC_CAP_FLASHT) {
+       case BCMA_CC_FLASHT_STSER:
+               b47s->type = BCM47XXSFLASH_TYPE_ST;
+               break;
+       case BCMA_CC_FLASHT_ATSER:
+               b47s->type = BCM47XXSFLASH_TYPE_ATMEL;
+               break;
+       }
+
        b47s->window = sflash->window;
        b47s->blocksize = sflash->blocksize;
        b47s->numblocks = sflash->numblocks;
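
The probe change above recovers the owning bcma_drv_cc from the embedded sflash member with container_of() and then keys the flash type off the chipcommon capabilities. For readers unfamiliar with the idiom, a self-contained sketch with illustrative types only (not the bcma definitions):

/*
 * Sketch of the container_of() idiom: given a pointer to a member that
 * is embedded in a larger structure, recover the enclosing structure.
 */
#include <linux/kernel.h>
#include <linux/types.h>

struct serial_flash {
	u32 window;
	u32 blocksize;
};

struct chip_common {
	u32 capabilities;
	struct serial_flash sflash;	/* embedded, as in bcma_drv_cc */
};

static struct chip_common *cc_from_sflash(struct serial_flash *sflash)
{
	return container_of(sflash, struct chip_common, sflash);
}
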
index ebf6f710e23c0ada24b3de883db29acf0be6c7ec..f22f8c46dfc059566ae8cd4fcf4c90f810a7fd2a 100644 (file)
@@ -3,7 +3,66 @@
 
 #include <linux/mtd/mtd.h>
 
+/* Used for ST flashes only. */
+#define OPCODE_ST_WREN         0x0006          /* Write Enable */
+#define OPCODE_ST_WRDIS                0x0004          /* Write Disable */
+#define OPCODE_ST_RDSR         0x0105          /* Read Status Register */
+#define OPCODE_ST_WRSR         0x0101          /* Write Status Register */
+#define OPCODE_ST_READ         0x0303          /* Read Data Bytes */
+#define OPCODE_ST_PP           0x0302          /* Page Program */
+#define OPCODE_ST_SE           0x02d8          /* Sector Erase */
+#define OPCODE_ST_BE           0x00c7          /* Bulk Erase */
+#define OPCODE_ST_DP           0x00b9          /* Deep Power-down */
+#define OPCODE_ST_RES          0x03ab          /* Read Electronic Signature */
+#define OPCODE_ST_CSA          0x1000          /* Keep chip select asserted */
+#define OPCODE_ST_SSE          0x0220          /* Sub-sector Erase */
+
+/* Used for Atmel flashes only. */
+#define OPCODE_AT_READ                         0x07e8
+#define OPCODE_AT_PAGE_READ                    0x07d2
+#define OPCODE_AT_STATUS                       0x01d7
+#define OPCODE_AT_BUF1_WRITE                   0x0384
+#define OPCODE_AT_BUF2_WRITE                   0x0387
+#define OPCODE_AT_BUF1_ERASE_PROGRAM           0x0283
+#define OPCODE_AT_BUF2_ERASE_PROGRAM           0x0286
+#define OPCODE_AT_BUF1_PROGRAM                 0x0288
+#define OPCODE_AT_BUF2_PROGRAM                 0x0289
+#define OPCODE_AT_PAGE_ERASE                   0x0281
+#define OPCODE_AT_BLOCK_ERASE                  0x0250
+#define OPCODE_AT_BUF1_WRITE_ERASE_PROGRAM     0x0382
+#define OPCODE_AT_BUF2_WRITE_ERASE_PROGRAM     0x0385
+#define OPCODE_AT_BUF1_LOAD                    0x0253
+#define OPCODE_AT_BUF2_LOAD                    0x0255
+#define OPCODE_AT_BUF1_COMPARE                 0x0260
+#define OPCODE_AT_BUF2_COMPARE                 0x0261
+#define OPCODE_AT_BUF1_REPROGRAM               0x0258
+#define OPCODE_AT_BUF2_REPROGRAM               0x0259
+
+/* Status register bits for ST flashes */
+#define SR_ST_WIP              0x01            /* Write In Progress */
+#define SR_ST_WEL              0x02            /* Write Enable Latch */
+#define SR_ST_BP_MASK          0x1c            /* Block Protect */
+#define SR_ST_BP_SHIFT         2
+#define SR_ST_SRWD             0x80            /* Status Register Write Disable */
+
+/* Status register bits for Atmel flashes */
+#define SR_AT_READY            0x80
+#define SR_AT_MISMATCH         0x40
+#define SR_AT_ID_MASK          0x38
+#define SR_AT_ID_SHIFT         3
+
+struct bcma_drv_cc;
+
+enum bcm47xxsflash_type {
+       BCM47XXSFLASH_TYPE_ATMEL,
+       BCM47XXSFLASH_TYPE_ST,
+};
+
 struct bcm47xxsflash {
+       struct bcma_drv_cc *bcma_cc;
+
+       enum bcm47xxsflash_type type;
+
        u32 window;
        u32 blocksize;
        u16 numblocks;
diff --git a/drivers/mtd/devices/doc2000.c b/drivers/mtd/devices/doc2000.c
deleted file mode 100644 (file)
index a4eb8b5..0000000
+++ /dev/null
@@ -1,1178 +0,0 @@
-
-/*
- * Linux driver for Disk-On-Chip 2000 and Millennium
- * (c) 1999 Machine Vision Holdings, Inc.
- * (c) 1999, 2000 David Woodhouse <dwmw2@infradead.org>
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <asm/errno.h>
-#include <asm/io.h>
-#include <asm/uaccess.h>
-#include <linux/delay.h>
-#include <linux/slab.h>
-#include <linux/sched.h>
-#include <linux/init.h>
-#include <linux/types.h>
-#include <linux/bitops.h>
-#include <linux/mutex.h>
-
-#include <linux/mtd/mtd.h>
-#include <linux/mtd/nand.h>
-#include <linux/mtd/doc2000.h>
-
-#define DOC_SUPPORT_2000
-#define DOC_SUPPORT_2000TSOP
-#define DOC_SUPPORT_MILLENNIUM
-
-#ifdef DOC_SUPPORT_2000
-#define DoC_is_2000(doc) (doc->ChipID == DOC_ChipID_Doc2k)
-#else
-#define DoC_is_2000(doc) (0)
-#endif
-
-#if defined(DOC_SUPPORT_2000TSOP) || defined(DOC_SUPPORT_MILLENNIUM)
-#define DoC_is_Millennium(doc) (doc->ChipID == DOC_ChipID_DocMil)
-#else
-#define DoC_is_Millennium(doc) (0)
-#endif
-
-/* #define ECC_DEBUG */
-
-/* I have no idea why some DoC chips can not use memcpy_from|to_io().
- * This may be due to the different revisions of the ASIC controller built-in or
- * simplily a QA/Bug issue. Who knows ?? If you have trouble, please uncomment
- * this:
- #undef USE_MEMCPY
-*/
-
-static int doc_read(struct mtd_info *mtd, loff_t from, size_t len,
-                   size_t *retlen, u_char *buf);
-static int doc_write(struct mtd_info *mtd, loff_t to, size_t len,
-                    size_t *retlen, const u_char *buf);
-static int doc_read_oob(struct mtd_info *mtd, loff_t ofs,
-                       struct mtd_oob_ops *ops);
-static int doc_write_oob(struct mtd_info *mtd, loff_t ofs,
-                        struct mtd_oob_ops *ops);
-static int doc_write_oob_nolock(struct mtd_info *mtd, loff_t ofs, size_t len,
-                        size_t *retlen, const u_char *buf);
-static int doc_erase (struct mtd_info *mtd, struct erase_info *instr);
-
-static struct mtd_info *doc2klist = NULL;
-
-/* Perform the required delay cycles by reading from the appropriate register */
-static void DoC_Delay(struct DiskOnChip *doc, unsigned short cycles)
-{
-       volatile char dummy;
-       int i;
-
-       for (i = 0; i < cycles; i++) {
-               if (DoC_is_Millennium(doc))
-                       dummy = ReadDOC(doc->virtadr, NOP);
-               else
-                       dummy = ReadDOC(doc->virtadr, DOCStatus);
-       }
-
-}
-
-/* DOC_WaitReady: Wait for RDY line to be asserted by the flash chip */
-static int _DoC_WaitReady(struct DiskOnChip *doc)
-{
-       void __iomem *docptr = doc->virtadr;
-       unsigned long timeo = jiffies + (HZ * 10);
-
-       pr_debug("_DoC_WaitReady called for out-of-line wait\n");
-
-       /* Out-of-line routine to wait for chip response */
-       while (!(ReadDOC(docptr, CDSNControl) & CDSN_CTRL_FR_B)) {
-               /* issue 2 read from NOP register after reading from CDSNControl register
-               see Software Requirement 11.4 item 2. */
-               DoC_Delay(doc, 2);
-
-               if (time_after(jiffies, timeo)) {
-                       pr_debug("_DoC_WaitReady timed out.\n");
-                       return -EIO;
-               }
-               udelay(1);
-               cond_resched();
-       }
-
-       return 0;
-}
-
-static inline int DoC_WaitReady(struct DiskOnChip *doc)
-{
-       void __iomem *docptr = doc->virtadr;
-
-       /* This is inline, to optimise the common case, where it's ready instantly */
-       int ret = 0;
-
-       /* 4 read form NOP register should be issued in prior to the read from CDSNControl
-          see Software Requirement 11.4 item 2. */
-       DoC_Delay(doc, 4);
-
-       if (!(ReadDOC(docptr, CDSNControl) & CDSN_CTRL_FR_B))
-               /* Call the out-of-line routine to wait */
-               ret = _DoC_WaitReady(doc);
-
-       /* issue 2 read from NOP register after reading from CDSNControl register
-          see Software Requirement 11.4 item 2. */
-       DoC_Delay(doc, 2);
-
-       return ret;
-}
-
-/* DoC_Command: Send a flash command to the flash chip through the CDSN Slow IO register to
-   bypass the internal pipeline. Each of 4 delay cycles (read from the NOP register) is
-   required after writing to CDSN Control register, see Software Requirement 11.4 item 3. */
-
-static int DoC_Command(struct DiskOnChip *doc, unsigned char command,
-                             unsigned char xtraflags)
-{
-       void __iomem *docptr = doc->virtadr;
-
-       if (DoC_is_2000(doc))
-               xtraflags |= CDSN_CTRL_FLASH_IO;
-
-       /* Assert the CLE (Command Latch Enable) line to the flash chip */
-       WriteDOC(xtraflags | CDSN_CTRL_CLE | CDSN_CTRL_CE, docptr, CDSNControl);
-       DoC_Delay(doc, 4);      /* Software requirement 11.4.3 for Millennium */
-
-       if (DoC_is_Millennium(doc))
-               WriteDOC(command, docptr, CDSNSlowIO);
-
-       /* Send the command */
-       WriteDOC_(command, docptr, doc->ioreg);
-       if (DoC_is_Millennium(doc))
-               WriteDOC(command, docptr, WritePipeTerm);
-
-       /* Lower the CLE line */
-       WriteDOC(xtraflags | CDSN_CTRL_CE, docptr, CDSNControl);
-       DoC_Delay(doc, 4);      /* Software requirement 11.4.3 for Millennium */
-
-       /* Wait for the chip to respond - Software requirement 11.4.1 (extended for any command) */
-       return DoC_WaitReady(doc);
-}
-
-/* DoC_Address: Set the current address for the flash chip through the CDSN Slow IO register to
-   bypass the internal pipeline. Each of 4 delay cycles (read from the NOP register) is
-   required after writing to CDSN Control register, see Software Requirement 11.4 item 3. */
-
-static int DoC_Address(struct DiskOnChip *doc, int numbytes, unsigned long ofs,
-                      unsigned char xtraflags1, unsigned char xtraflags2)
-{
-       int i;
-       void __iomem *docptr = doc->virtadr;
-
-       if (DoC_is_2000(doc))
-               xtraflags1 |= CDSN_CTRL_FLASH_IO;
-
-       /* Assert the ALE (Address Latch Enable) line to the flash chip */
-       WriteDOC(xtraflags1 | CDSN_CTRL_ALE | CDSN_CTRL_CE, docptr, CDSNControl);
-
-       DoC_Delay(doc, 4);      /* Software requirement 11.4.3 for Millennium */
-
-       /* Send the address */
-       /* Devices with 256-byte page are addressed as:
-          Column (bits 0-7), Page (bits 8-15, 16-23, 24-31)
-          * there is no device on the market with page256
-          and more than 24 bits.
-          Devices with 512-byte page are addressed as:
-          Column (bits 0-7), Page (bits 9-16, 17-24, 25-31)
-          * 25-31 is sent only if the chip support it.
-          * bit 8 changes the read command to be sent
-          (NAND_CMD_READ0 or NAND_CMD_READ1).
-        */
-
-       if (numbytes == ADDR_COLUMN || numbytes == ADDR_COLUMN_PAGE) {
-               if (DoC_is_Millennium(doc))
-                       WriteDOC(ofs & 0xff, docptr, CDSNSlowIO);
-               WriteDOC_(ofs & 0xff, docptr, doc->ioreg);
-       }
-
-       if (doc->page256) {
-               ofs = ofs >> 8;
-       } else {
-               ofs = ofs >> 9;
-       }
-
-       if (numbytes == ADDR_PAGE || numbytes == ADDR_COLUMN_PAGE) {
-               for (i = 0; i < doc->pageadrlen; i++, ofs = ofs >> 8) {
-                       if (DoC_is_Millennium(doc))
-                               WriteDOC(ofs & 0xff, docptr, CDSNSlowIO);
-                       WriteDOC_(ofs & 0xff, docptr, doc->ioreg);
-               }
-       }
-
-       if (DoC_is_Millennium(doc))
-               WriteDOC(ofs & 0xff, docptr, WritePipeTerm);
-
-       DoC_Delay(doc, 2);      /* Needed for some slow flash chips. mf. */
-
-       /* FIXME: The SlowIO's for millennium could be replaced by
-          a single WritePipeTerm here. mf. */
-
-       /* Lower the ALE line */
-       WriteDOC(xtraflags1 | xtraflags2 | CDSN_CTRL_CE, docptr,
-                CDSNControl);
-
-       DoC_Delay(doc, 4);      /* Software requirement 11.4.3 for Millennium */
-
-       /* Wait for the chip to respond - Software requirement 11.4.1 */
-       return DoC_WaitReady(doc);
-}
-
-/* Read a buffer from DoC, taking care of Millennium odditys */
-static void DoC_ReadBuf(struct DiskOnChip *doc, u_char * buf, int len)
-{
-       volatile int dummy;
-       int modulus = 0xffff;
-       void __iomem *docptr = doc->virtadr;
-       int i;
-
-       if (len <= 0)
-               return;
-
-       if (DoC_is_Millennium(doc)) {
-               /* Read the data via the internal pipeline through CDSN IO register,
-                  see Pipelined Read Operations 11.3 */
-               dummy = ReadDOC(docptr, ReadPipeInit);
-
-               /* Millennium should use the LastDataRead register - Pipeline Reads */
-               len--;
-
-               /* This is needed for correctly ECC calculation */
-               modulus = 0xff;
-       }
-
-       for (i = 0; i < len; i++)
-               buf[i] = ReadDOC_(docptr, doc->ioreg + (i & modulus));
-
-       if (DoC_is_Millennium(doc)) {
-               buf[i] = ReadDOC(docptr, LastDataRead);
-       }
-}
-
-/* Write a buffer to DoC, taking care of Millennium odditys */
-static void DoC_WriteBuf(struct DiskOnChip *doc, const u_char * buf, int len)
-{
-       void __iomem *docptr = doc->virtadr;
-       int i;
-
-       if (len <= 0)
-               return;
-
-       for (i = 0; i < len; i++)
-               WriteDOC_(buf[i], docptr, doc->ioreg + i);
-
-       if (DoC_is_Millennium(doc)) {
-               WriteDOC(0x00, docptr, WritePipeTerm);
-       }
-}
-
-
-/* DoC_SelectChip: Select a given flash chip within the current floor */
-
-static inline int DoC_SelectChip(struct DiskOnChip *doc, int chip)
-{
-       void __iomem *docptr = doc->virtadr;
-
-       /* Software requirement 11.4.4 before writing DeviceSelect */
-       /* Deassert the CE line to eliminate glitches on the FCE# outputs */
-       WriteDOC(CDSN_CTRL_WP, docptr, CDSNControl);
-       DoC_Delay(doc, 4);      /* Software requirement 11.4.3 for Millennium */
-
-       /* Select the individual flash chip requested */
-       WriteDOC(chip, docptr, CDSNDeviceSelect);
-       DoC_Delay(doc, 4);
-
-       /* Reassert the CE line */
-       WriteDOC(CDSN_CTRL_CE | CDSN_CTRL_FLASH_IO | CDSN_CTRL_WP, docptr,
-                CDSNControl);
-       DoC_Delay(doc, 4);      /* Software requirement 11.4.3 for Millennium */
-
-       /* Wait for it to be ready */
-       return DoC_WaitReady(doc);
-}
-
-/* DoC_SelectFloor: Select a given floor (bank of flash chips) */
-
-static inline int DoC_SelectFloor(struct DiskOnChip *doc, int floor)
-{
-       void __iomem *docptr = doc->virtadr;
-
-       /* Select the floor (bank) of chips required */
-       WriteDOC(floor, docptr, FloorSelect);
-
-       /* Wait for the chip to be ready */
-       return DoC_WaitReady(doc);
-}
-
-/* DoC_IdentChip: Identify a given NAND chip given {floor,chip} */
-
-static int DoC_IdentChip(struct DiskOnChip *doc, int floor, int chip)
-{
-       int mfr, id, i, j;
-       volatile char dummy;
-
-       /* Page in the required floor/chip */
-       DoC_SelectFloor(doc, floor);
-       DoC_SelectChip(doc, chip);
-
-       /* Reset the chip */
-       if (DoC_Command(doc, NAND_CMD_RESET, CDSN_CTRL_WP)) {
-               pr_debug("DoC_Command (reset) for %d,%d returned true\n",
-                     floor, chip);
-               return 0;
-       }
-
-
-       /* Read the NAND chip ID: 1. Send ReadID command */
-       if (DoC_Command(doc, NAND_CMD_READID, CDSN_CTRL_WP)) {
-               pr_debug("DoC_Command (ReadID) for %d,%d returned true\n",
-                     floor, chip);
-               return 0;
-       }
-
-       /* Read the NAND chip ID: 2. Send address byte zero */
-       DoC_Address(doc, ADDR_COLUMN, 0, CDSN_CTRL_WP, 0);
-
-       /* Read the manufacturer and device id codes from the device */
-
-       if (DoC_is_Millennium(doc)) {
-               DoC_Delay(doc, 2);
-               dummy = ReadDOC(doc->virtadr, ReadPipeInit);
-               mfr = ReadDOC(doc->virtadr, LastDataRead);
-
-               DoC_Delay(doc, 2);
-               dummy = ReadDOC(doc->virtadr, ReadPipeInit);
-               id = ReadDOC(doc->virtadr, LastDataRead);
-       } else {
-               /* CDSN Slow IO register see Software Req 11.4 item 5. */
-               dummy = ReadDOC(doc->virtadr, CDSNSlowIO);
-               DoC_Delay(doc, 2);
-               mfr = ReadDOC_(doc->virtadr, doc->ioreg);
-
-               /* CDSN Slow IO register see Software Req 11.4 item 5. */
-               dummy = ReadDOC(doc->virtadr, CDSNSlowIO);
-               DoC_Delay(doc, 2);
-               id = ReadDOC_(doc->virtadr, doc->ioreg);
-       }
-
-       /* No response - return failure */
-       if (mfr == 0xff || mfr == 0)
-               return 0;
-
-       /* Check it's the same as the first chip we identified.
-        * M-Systems say that any given DiskOnChip device should only
-        * contain _one_ type of flash part, although that's not a
-        * hardware restriction. */
-       if (doc->mfr) {
-               if (doc->mfr == mfr && doc->id == id)
-                       return 1;       /* This is the same as the first */
-               else
-                       printk(KERN_WARNING
-                              "Flash chip at floor %d, chip %d is different:\n",
-                              floor, chip);
-       }
-
-       /* Print and store the manufacturer and ID codes. */
-       for (i = 0; nand_flash_ids[i].name != NULL; i++) {
-               if (id == nand_flash_ids[i].id) {
-                       /* Try to identify manufacturer */
-                       for (j = 0; nand_manuf_ids[j].id != 0x0; j++) {
-                               if (nand_manuf_ids[j].id == mfr)
-                                       break;
-                       }
-                       printk(KERN_INFO
-                              "Flash chip found: Manufacturer ID: %2.2X, "
-                              "Chip ID: %2.2X (%s:%s)\n", mfr, id,
-                              nand_manuf_ids[j].name, nand_flash_ids[i].name);
-                       if (!doc->mfr) {
-                               doc->mfr = mfr;
-                               doc->id = id;
-                               doc->chipshift =
-                                       ffs((nand_flash_ids[i].chipsize << 20)) - 1;
-                               doc->page256 = (nand_flash_ids[i].pagesize == 256) ? 1 : 0;
-                               doc->pageadrlen = doc->chipshift > 25 ? 3 : 2;
-                               doc->erasesize =
-                                   nand_flash_ids[i].erasesize;
-                               return 1;
-                       }
-                       return 0;
-               }
-       }
-
-
-       /* We haven't fully identified the chip. Print as much as we know. */
-       printk(KERN_WARNING "Unknown flash chip found: %2.2X %2.2X\n",
-              id, mfr);
-
-       printk(KERN_WARNING "Please report to dwmw2@infradead.org\n");
-       return 0;
-}
-
-/* DoC_ScanChips: Find all NAND chips present in a DiskOnChip, and identify them */
-
-static void DoC_ScanChips(struct DiskOnChip *this, int maxchips)
-{
-       int floor, chip;
-       int numchips[MAX_FLOORS];
-       int ret = 1;
-
-       this->numchips = 0;
-       this->mfr = 0;
-       this->id = 0;
-
-       /* For each floor, find the number of valid chips it contains */
-       for (floor = 0; floor < MAX_FLOORS; floor++) {
-               ret = 1;
-               numchips[floor] = 0;
-               for (chip = 0; chip < maxchips && ret != 0; chip++) {
-
-                       ret = DoC_IdentChip(this, floor, chip);
-                       if (ret) {
-                               numchips[floor]++;
-                               this->numchips++;
-                       }
-               }
-       }
-
-       /* If there are none at all that we recognise, bail */
-       if (!this->numchips) {
-               printk(KERN_NOTICE "No flash chips recognised.\n");
-               return;
-       }
-
-       /* Allocate an array to hold the information for each chip */
-       this->chips = kmalloc(sizeof(struct Nand) * this->numchips, GFP_KERNEL);
-       if (!this->chips) {
-               printk(KERN_NOTICE "No memory for allocating chip info structures\n");
-               return;
-       }
-
-       ret = 0;
-
-       /* Fill out the chip array with {floor, chipno} for each
-        * detected chip in the device. */
-       for (floor = 0; floor < MAX_FLOORS; floor++) {
-               for (chip = 0; chip < numchips[floor]; chip++) {
-                       this->chips[ret].floor = floor;
-                       this->chips[ret].chip = chip;
-                       this->chips[ret].curadr = 0;
-                       this->chips[ret].curmode = 0x50;
-                       ret++;
-               }
-       }
-
-       /* Calculate and print the total size of the device */
-       this->totlen = this->numchips * (1 << this->chipshift);
-
-       printk(KERN_INFO "%d flash chips found. Total DiskOnChip size: %ld MiB\n",
-              this->numchips, this->totlen >> 20);
-}
-
-static int DoC2k_is_alias(struct DiskOnChip *doc1, struct DiskOnChip *doc2)
-{
-       int tmp1, tmp2, retval;
-       if (doc1->physadr == doc2->physadr)
-               return 1;
-
-       /* Use the alias resolution register which was set aside for this
-        * purpose. If it's value is the same on both chips, they might
-        * be the same chip, and we write to one and check for a change in
-        * the other. It's unclear if this register is usuable in the
-        * DoC 2000 (it's in the Millennium docs), but it seems to work. */
-       tmp1 = ReadDOC(doc1->virtadr, AliasResolution);
-       tmp2 = ReadDOC(doc2->virtadr, AliasResolution);
-       if (tmp1 != tmp2)
-               return 0;
-
-       WriteDOC((tmp1 + 1) % 0xff, doc1->virtadr, AliasResolution);
-       tmp2 = ReadDOC(doc2->virtadr, AliasResolution);
-       if (tmp2 == (tmp1 + 1) % 0xff)
-               retval = 1;
-       else
-               retval = 0;
-
-       /* Restore register contents.  May not be necessary, but do it just to
-        * be safe. */
-       WriteDOC(tmp1, doc1->virtadr, AliasResolution);
-
-       return retval;
-}
-
-/* This routine is found from the docprobe code by symbol_get(),
- * which will bump the use count of this module. */
-void DoC2k_init(struct mtd_info *mtd)
-{
-       struct DiskOnChip *this = mtd->priv;
-       struct DiskOnChip *old = NULL;
-       int maxchips;
-
-       /* We must avoid being called twice for the same device. */
-
-       if (doc2klist)
-               old = doc2klist->priv;
-
-       while (old) {
-               if (DoC2k_is_alias(old, this)) {
-                       printk(KERN_NOTICE
-                              "Ignoring DiskOnChip 2000 at 0x%lX - already configured\n",
-                              this->physadr);
-                       iounmap(this->virtadr);
-                       kfree(mtd);
-                       return;
-               }
-               if (old->nextdoc)
-                       old = old->nextdoc->priv;
-               else
-                       old = NULL;
-       }
-
-
-       switch (this->ChipID) {
-       case DOC_ChipID_Doc2kTSOP:
-               mtd->name = "DiskOnChip 2000 TSOP";
-               this->ioreg = DoC_Mil_CDSN_IO;
-               /* Pretend it's a Millennium */
-               this->ChipID = DOC_ChipID_DocMil;
-               maxchips = MAX_CHIPS;
-               break;
-       case DOC_ChipID_Doc2k:
-               mtd->name = "DiskOnChip 2000";
-               this->ioreg = DoC_2k_CDSN_IO;
-               maxchips = MAX_CHIPS;
-               break;
-       case DOC_ChipID_DocMil:
-               mtd->name = "DiskOnChip Millennium";
-               this->ioreg = DoC_Mil_CDSN_IO;
-               maxchips = MAX_CHIPS_MIL;
-               break;
-       default:
-               printk("Unknown ChipID 0x%02x\n", this->ChipID);
-               kfree(mtd);
-               iounmap(this->virtadr);
-               return;
-       }
-
-       printk(KERN_NOTICE "%s found at address 0x%lX\n", mtd->name,
-              this->physadr);
-
-       mtd->type = MTD_NANDFLASH;
-       mtd->flags = MTD_CAP_NANDFLASH;
-       mtd->writebufsize = mtd->writesize = 512;
-       mtd->oobsize = 16;
-       mtd->ecc_strength = 2;
-       mtd->owner = THIS_MODULE;
-       mtd->_erase = doc_erase;
-       mtd->_read = doc_read;
-       mtd->_write = doc_write;
-       mtd->_read_oob = doc_read_oob;
-       mtd->_write_oob = doc_write_oob;
-       this->curfloor = -1;
-       this->curchip = -1;
-       mutex_init(&this->lock);
-
-       /* Ident all the chips present. */
-       DoC_ScanChips(this, maxchips);
-
-       if (!this->totlen) {
-               kfree(mtd);
-               iounmap(this->virtadr);
-       } else {
-               this->nextdoc = doc2klist;
-               doc2klist = mtd;
-               mtd->size = this->totlen;
-               mtd->erasesize = this->erasesize;
-               mtd_device_register(mtd, NULL, 0);
-               return;
-       }
-}
-EXPORT_SYMBOL_GPL(DoC2k_init);
-
-static int doc_read(struct mtd_info *mtd, loff_t from, size_t len,
-                   size_t * retlen, u_char * buf)
-{
-       struct DiskOnChip *this = mtd->priv;
-       void __iomem *docptr = this->virtadr;
-       struct Nand *mychip;
-       unsigned char syndrome[6], eccbuf[6];
-       volatile char dummy;
-       int i, len256 = 0, ret=0;
-       size_t left = len;
-
-       mutex_lock(&this->lock);
-       while (left) {
-               len = left;
-
-               /* Don't allow a single read to cross a 512-byte block boundary */
-               if (from + len > ((from | 0x1ff) + 1))
-                       len = ((from | 0x1ff) + 1) - from;
-
-               /* The ECC will not be calculated correctly if less than 512 is read */
-               if (len != 0x200)
-                       printk(KERN_WARNING
-                              "ECC needs a full sector read (adr: %lx size %lx)\n",
-                              (long) from, (long) len);
-
-               /* printk("DoC_Read (adr: %lx size %lx)\n", (long) from, (long) len); */
-
-
-               /* Find the chip which is to be used and select it */
-               mychip = &this->chips[from >> (this->chipshift)];
-
-               if (this->curfloor != mychip->floor) {
-                       DoC_SelectFloor(this, mychip->floor);
-                       DoC_SelectChip(this, mychip->chip);
-               } else if (this->curchip != mychip->chip) {
-                       DoC_SelectChip(this, mychip->chip);
-               }
-
-               this->curfloor = mychip->floor;
-               this->curchip = mychip->chip;
-
-               DoC_Command(this,
-                           (!this->page256
-                            && (from & 0x100)) ? NAND_CMD_READ1 : NAND_CMD_READ0,
-                           CDSN_CTRL_WP);
-               DoC_Address(this, ADDR_COLUMN_PAGE, from, CDSN_CTRL_WP,
-                           CDSN_CTRL_ECC_IO);
-
-               /* Prime the ECC engine */
-               WriteDOC(DOC_ECC_RESET, docptr, ECCConf);
-               WriteDOC(DOC_ECC_EN, docptr, ECCConf);
-
-               /* treat crossing 256-byte sector for 2M x 8bits devices */
-               if (this->page256 && from + len > (from | 0xff) + 1) {
-                       len256 = (from | 0xff) + 1 - from;
-                       DoC_ReadBuf(this, buf, len256);
-
-                       DoC_Command(this, NAND_CMD_READ0, CDSN_CTRL_WP);
-                       DoC_Address(this, ADDR_COLUMN_PAGE, from + len256,
-                                   CDSN_CTRL_WP, CDSN_CTRL_ECC_IO);
-               }
-
-               DoC_ReadBuf(this, &buf[len256], len - len256);
-
-               /* Let the caller know we completed it */
-               *retlen += len;
-
-               /* Read the ECC data through the DiskOnChip ECC logic */
-               /* Note: this will work even with 2M x 8bit devices as   */
-               /*       they have 8 bytes of OOB per 256 page. mf.      */
-               DoC_ReadBuf(this, eccbuf, 6);
-
-               /* Flush the pipeline */
-               if (DoC_is_Millennium(this)) {
-                       dummy = ReadDOC(docptr, ECCConf);
-                       dummy = ReadDOC(docptr, ECCConf);
-                       i = ReadDOC(docptr, ECCConf);
-               } else {
-                       dummy = ReadDOC(docptr, 2k_ECCStatus);
-                       dummy = ReadDOC(docptr, 2k_ECCStatus);
-                       i = ReadDOC(docptr, 2k_ECCStatus);
-               }
-
-               /* Check the ECC Status */
-               if (i & 0x80) {
-                       int nb_errors;
-                       /* There was an ECC error */
-#ifdef ECC_DEBUG
-                       printk(KERN_ERR "DiskOnChip ECC Error: Read at %lx\n", (long)from);
-#endif
-                       /* Read the ECC syndrome through the DiskOnChip ECC
-                          logic.  These syndrome will be all ZERO when there
-                          is no error */
-                       for (i = 0; i < 6; i++) {
-                               syndrome[i] =
-                                       ReadDOC(docptr, ECCSyndrome0 + i);
-                       }
-                       nb_errors = doc_decode_ecc(buf, syndrome);
-
-#ifdef ECC_DEBUG
-                       printk(KERN_ERR "Errors corrected: %x\n", nb_errors);
-#endif
-                       if (nb_errors < 0) {
-                               /* We return error, but have actually done the
-                                  read. Not that this can be told to
-                                  user-space, via sys_read(), but at least
-                                  MTD-aware stuff can know about it by
-                                  checking *retlen */
-                               ret = -EIO;
-                       }
-               }
-
-#ifdef PSYCHO_DEBUG
-               printk(KERN_DEBUG "ECC DATA at %lxB: %2.2X %2.2X %2.2X %2.2X %2.2X %2.2X\n",
-                      (long)from, eccbuf[0], eccbuf[1], eccbuf[2],
-                      eccbuf[3], eccbuf[4], eccbuf[5]);
-#endif
-
-               /* disable the ECC engine */
-               WriteDOC(DOC_ECC_DIS, docptr , ECCConf);
-
-               /* according to 11.4.1, we need to wait for the busy line
-                * drop if we read to the end of the page.  */
-               if(0 == ((from + len) & 0x1ff))
-               {
-                   DoC_WaitReady(this);
-               }
-
-               from += len;
-               left -= len;
-               buf += len;
-       }
-
-       mutex_unlock(&this->lock);
-
-       return ret;
-}
-
-static int doc_write(struct mtd_info *mtd, loff_t to, size_t len,
-                    size_t * retlen, const u_char * buf)
-{
-       struct DiskOnChip *this = mtd->priv;
-       int di; /* Yes, DI is a hangover from when I was disassembling the binary driver */
-       void __iomem *docptr = this->virtadr;
-       unsigned char eccbuf[6];
-       volatile char dummy;
-       int len256 = 0;
-       struct Nand *mychip;
-       size_t left = len;
-       int status;
-
-       mutex_lock(&this->lock);
-       while (left) {
-               len = left;
-
-               /* Don't allow a single write to cross a 512-byte block boundary */
-               if (to + len > ((to | 0x1ff) + 1))
-                       len = ((to | 0x1ff) + 1) - to;
-
-               /* The ECC will not be calculated correctly if less than 512 is written */
-/* DBB-
-               if (len != 0x200 && eccbuf)
-                       printk(KERN_WARNING
-                              "ECC needs a full sector write (adr: %lx size %lx)\n",
-                              (long) to, (long) len);
-   -DBB */
-
-               /* printk("DoC_Write (adr: %lx size %lx)\n", (long) to, (long) len); */
-
-               /* Find the chip which is to be used and select it */
-               mychip = &this->chips[to >> (this->chipshift)];
-
-               if (this->curfloor != mychip->floor) {
-                       DoC_SelectFloor(this, mychip->floor);
-                       DoC_SelectChip(this, mychip->chip);
-               } else if (this->curchip != mychip->chip) {
-                       DoC_SelectChip(this, mychip->chip);
-               }
-
-               this->curfloor = mychip->floor;
-               this->curchip = mychip->chip;
-
-               /* Set device to main plane of flash */
-               DoC_Command(this, NAND_CMD_RESET, CDSN_CTRL_WP);
-               DoC_Command(this,
-                           (!this->page256
-                            && (to & 0x100)) ? NAND_CMD_READ1 : NAND_CMD_READ0,
-                           CDSN_CTRL_WP);
-
-               DoC_Command(this, NAND_CMD_SEQIN, 0);
-               DoC_Address(this, ADDR_COLUMN_PAGE, to, 0, CDSN_CTRL_ECC_IO);
-
-               /* Prime the ECC engine */
-               WriteDOC(DOC_ECC_RESET, docptr, ECCConf);
-               WriteDOC(DOC_ECC_EN | DOC_ECC_RW, docptr, ECCConf);
-
-               /* treat crossing 256-byte sector for 2M x 8bits devices */
-               if (this->page256 && to + len > (to | 0xff) + 1) {
-                       len256 = (to | 0xff) + 1 - to;
-                       DoC_WriteBuf(this, buf, len256);
-
-                       DoC_Command(this, NAND_CMD_PAGEPROG, 0);
-
-                       DoC_Command(this, NAND_CMD_STATUS, CDSN_CTRL_WP);
-                       /* There's an implicit DoC_WaitReady() in DoC_Command */
-
-                       dummy = ReadDOC(docptr, CDSNSlowIO);
-                       DoC_Delay(this, 2);
-
-                       if (ReadDOC_(docptr, this->ioreg) & 1) {
-                               printk(KERN_ERR "Error programming flash\n");
-                               /* Error in programming */
-                               *retlen = 0;
-                               mutex_unlock(&this->lock);
-                               return -EIO;
-                       }
-
-                       DoC_Command(this, NAND_CMD_SEQIN, 0);
-                       DoC_Address(this, ADDR_COLUMN_PAGE, to + len256, 0,
-                                   CDSN_CTRL_ECC_IO);
-               }
-
-               DoC_WriteBuf(this, &buf[len256], len - len256);
-
-               WriteDOC(CDSN_CTRL_ECC_IO | CDSN_CTRL_CE, docptr, CDSNControl);
-
-               if (DoC_is_Millennium(this)) {
-                       WriteDOC(0, docptr, NOP);
-                       WriteDOC(0, docptr, NOP);
-                       WriteDOC(0, docptr, NOP);
-               } else {
-                       WriteDOC_(0, docptr, this->ioreg);
-                       WriteDOC_(0, docptr, this->ioreg);
-                       WriteDOC_(0, docptr, this->ioreg);
-               }
-
-               WriteDOC(CDSN_CTRL_ECC_IO | CDSN_CTRL_FLASH_IO | CDSN_CTRL_CE, docptr,
-                        CDSNControl);
-
-               /* Read the ECC data through the DiskOnChip ECC logic */
-               for (di = 0; di < 6; di++) {
-                       eccbuf[di] = ReadDOC(docptr, ECCSyndrome0 + di);
-               }
-
-               /* Reset the ECC engine */
-               WriteDOC(DOC_ECC_DIS, docptr, ECCConf);
-
-#ifdef PSYCHO_DEBUG
-               printk
-                       ("OOB data at %lx is %2.2X %2.2X %2.2X %2.2X %2.2X %2.2X\n",
-                        (long) to, eccbuf[0], eccbuf[1], eccbuf[2], eccbuf[3],
-                        eccbuf[4], eccbuf[5]);
-#endif
-               DoC_Command(this, NAND_CMD_PAGEPROG, 0);
-
-               DoC_Command(this, NAND_CMD_STATUS, CDSN_CTRL_WP);
-               /* There's an implicit DoC_WaitReady() in DoC_Command */
-
-               if (DoC_is_Millennium(this)) {
-                       ReadDOC(docptr, ReadPipeInit);
-                       status = ReadDOC(docptr, LastDataRead);
-               } else {
-                       dummy = ReadDOC(docptr, CDSNSlowIO);
-                       DoC_Delay(this, 2);
-                       status = ReadDOC_(docptr, this->ioreg);
-               }
-
-               if (status & 1) {
-                       printk(KERN_ERR "Error programming flash\n");
-                       /* Error in programming */
-                       *retlen = 0;
-                       mutex_unlock(&this->lock);
-                       return -EIO;
-               }
-
-               /* Let the caller know we completed it */
-               *retlen += len;
-
-               {
-                       unsigned char x[8];
-                       size_t dummy;
-                       int ret;
-
-                       /* Write the ECC data to flash */
-                       for (di=0; di<6; di++)
-                               x[di] = eccbuf[di];
-
-                       x[6]=0x55;
-                       x[7]=0x55;
-
-                       ret = doc_write_oob_nolock(mtd, to, 8, &dummy, x);
-                       if (ret) {
-                               mutex_unlock(&this->lock);
-                               return ret;
-                       }
-               }
-
-               to += len;
-               left -= len;
-               buf += len;
-       }
-
-       mutex_unlock(&this->lock);
-       return 0;
-}
-
-static int doc_read_oob(struct mtd_info *mtd, loff_t ofs,
-                       struct mtd_oob_ops *ops)
-{
-       struct DiskOnChip *this = mtd->priv;
-       int len256 = 0, ret;
-       struct Nand *mychip;
-       uint8_t *buf = ops->oobbuf;
-       size_t len = ops->len;
-
-       BUG_ON(ops->mode != MTD_OPS_PLACE_OOB);
-
-       ofs += ops->ooboffs;
-
-       mutex_lock(&this->lock);
-
-       mychip = &this->chips[ofs >> this->chipshift];
-
-       if (this->curfloor != mychip->floor) {
-               DoC_SelectFloor(this, mychip->floor);
-               DoC_SelectChip(this, mychip->chip);
-       } else if (this->curchip != mychip->chip) {
-               DoC_SelectChip(this, mychip->chip);
-       }
-       this->curfloor = mychip->floor;
-       this->curchip = mychip->chip;
-
-       /* update address for 2M x 8bit devices. OOB starts on the second */
-       /* page to maintain compatibility with doc_read_ecc. */
-       if (this->page256) {
-               if (!(ofs & 0x8))
-                       ofs += 0x100;
-               else
-                       ofs -= 0x8;
-       }
-
-       DoC_Command(this, NAND_CMD_READOOB, CDSN_CTRL_WP);
-       DoC_Address(this, ADDR_COLUMN_PAGE, ofs, CDSN_CTRL_WP, 0);
-
-       /* treat crossing 8-byte OOB data for 2M x 8bit devices */
-       /* Note: datasheet says it should automaticaly wrap to the */
-       /*       next OOB block, but it didn't work here. mf.      */
-       if (this->page256 && ofs + len > (ofs | 0x7) + 1) {
-               len256 = (ofs | 0x7) + 1 - ofs;
-               DoC_ReadBuf(this, buf, len256);
-
-               DoC_Command(this, NAND_CMD_READOOB, CDSN_CTRL_WP);
-               DoC_Address(this, ADDR_COLUMN_PAGE, ofs & (~0x1ff),
-                           CDSN_CTRL_WP, 0);
-       }
-
-       DoC_ReadBuf(this, &buf[len256], len - len256);
-
-       ops->retlen = len;
-       /* Reading the full OOB data drops us off of the end of the page,
-         * causing the flash device to go into busy mode, so we need
-         * to wait until ready 11.4.1 and Toshiba TC58256FT docs */
-
-       ret = DoC_WaitReady(this);
-
-       mutex_unlock(&this->lock);
-       return ret;
-
-}
-
-static int doc_write_oob_nolock(struct mtd_info *mtd, loff_t ofs, size_t len,
-                               size_t * retlen, const u_char * buf)
-{
-       struct DiskOnChip *this = mtd->priv;
-       int len256 = 0;
-       void __iomem *docptr = this->virtadr;
-       struct Nand *mychip = &this->chips[ofs >> this->chipshift];
-       volatile int dummy;
-       int status;
-
-       //      printk("doc_write_oob(%lx, %d): %2.2X %2.2X %2.2X %2.2X ... %2.2X %2.2X .. %2.2X %2.2X\n",(long)ofs, len,
-       //   buf[0], buf[1], buf[2], buf[3], buf[8], buf[9], buf[14],buf[15]);
-
-       /* Find the chip which is to be used and select it */
-       if (this->curfloor != mychip->floor) {
-               DoC_SelectFloor(this, mychip->floor);
-               DoC_SelectChip(this, mychip->chip);
-       } else if (this->curchip != mychip->chip) {
-               DoC_SelectChip(this, mychip->chip);
-       }
-       this->curfloor = mychip->floor;
-       this->curchip = mychip->chip;
-
-       /* disable the ECC engine */
-       WriteDOC (DOC_ECC_RESET, docptr, ECCConf);
-       WriteDOC (DOC_ECC_DIS, docptr, ECCConf);
-
-       /* Reset the chip, see Software Requirement 11.4 item 1. */
-       DoC_Command(this, NAND_CMD_RESET, CDSN_CTRL_WP);
-
-       /* issue the Read2 command to set the pointer to the Spare Data Area. */
-       DoC_Command(this, NAND_CMD_READOOB, CDSN_CTRL_WP);
-
-       /* update address for 2M x 8bit devices. OOB starts on the second */
-       /* page to maintain compatibility with doc_read_ecc. */
-       if (this->page256) {
-               if (!(ofs & 0x8))
-                       ofs += 0x100;
-               else
-                       ofs -= 0x8;
-       }
-
-       /* issue the Serial Data In command to initiate the Page Program process */
-       DoC_Command(this, NAND_CMD_SEQIN, 0);
-       DoC_Address(this, ADDR_COLUMN_PAGE, ofs, 0, 0);
-
-       /* handle OOB data that crosses the 8-byte boundary on 2M x 8bit devices */
-       /* Note: the datasheet says it should automatically wrap to the */
-       /*       next OOB block, but it didn't work here. mf.           */
-       if (this->page256 && ofs + len > (ofs | 0x7) + 1) {
-               len256 = (ofs | 0x7) + 1 - ofs;
-               DoC_WriteBuf(this, buf, len256);
-
-               DoC_Command(this, NAND_CMD_PAGEPROG, 0);
-               DoC_Command(this, NAND_CMD_STATUS, 0);
-               /* DoC_WaitReady() is implicit in DoC_Command */
-
-               if (DoC_is_Millennium(this)) {
-                       ReadDOC(docptr, ReadPipeInit);
-                       status = ReadDOC(docptr, LastDataRead);
-               } else {
-                       dummy = ReadDOC(docptr, CDSNSlowIO);
-                       DoC_Delay(this, 2);
-                       status = ReadDOC_(docptr, this->ioreg);
-               }
-
-               if (status & 1) {
-                       printk(KERN_ERR "Error programming oob data\n");
-                       /* There was an error */
-                       *retlen = 0;
-                       return -EIO;
-               }
-               DoC_Command(this, NAND_CMD_SEQIN, 0);
-               DoC_Address(this, ADDR_COLUMN_PAGE, ofs & (~0x1ff), 0, 0);
-       }
-
-       DoC_WriteBuf(this, &buf[len256], len - len256);
-
-       DoC_Command(this, NAND_CMD_PAGEPROG, 0);
-       DoC_Command(this, NAND_CMD_STATUS, 0);
-       /* DoC_WaitReady() is implicit in DoC_Command */
-
-       if (DoC_is_Millennium(this)) {
-               ReadDOC(docptr, ReadPipeInit);
-               status = ReadDOC(docptr, LastDataRead);
-       } else {
-               dummy = ReadDOC(docptr, CDSNSlowIO);
-               DoC_Delay(this, 2);
-               status = ReadDOC_(docptr, this->ioreg);
-       }
-
-       if (status & 1) {
-               printk(KERN_ERR "Error programming oob data\n");
-               /* There was an error */
-               *retlen = 0;
-               return -EIO;
-       }
-
-       *retlen = len;
-       return 0;
-
-}
-
-static int doc_write_oob(struct mtd_info *mtd, loff_t ofs,
-                        struct mtd_oob_ops *ops)
-{
-       struct DiskOnChip *this = mtd->priv;
-       int ret;
-
-       BUG_ON(ops->mode != MTD_OPS_PLACE_OOB);
-
-       mutex_lock(&this->lock);
-       ret = doc_write_oob_nolock(mtd, ofs + ops->ooboffs, ops->len,
-                                  &ops->retlen, ops->oobbuf);
-
-       mutex_unlock(&this->lock);
-       return ret;
-}
-
-static int doc_erase(struct mtd_info *mtd, struct erase_info *instr)
-{
-       struct DiskOnChip *this = mtd->priv;
-       __u32 ofs = instr->addr;
-       __u32 len = instr->len;
-       volatile int dummy;
-       void __iomem *docptr = this->virtadr;
-       struct Nand *mychip;
-       int status;
-
-       mutex_lock(&this->lock);
-
-       if (ofs & (mtd->erasesize-1) || len & (mtd->erasesize-1)) {
-               mutex_unlock(&this->lock);
-               return -EINVAL;
-       }
-
-       instr->state = MTD_ERASING;
-
-       /* FIXME: Do this in the background. Use timers or schedule_task() */
-       while(len) {
-               mychip = &this->chips[ofs >> this->chipshift];
-
-               if (this->curfloor != mychip->floor) {
-                       DoC_SelectFloor(this, mychip->floor);
-                       DoC_SelectChip(this, mychip->chip);
-               } else if (this->curchip != mychip->chip) {
-                       DoC_SelectChip(this, mychip->chip);
-               }
-               this->curfloor = mychip->floor;
-               this->curchip = mychip->chip;
-
-               DoC_Command(this, NAND_CMD_ERASE1, 0);
-               DoC_Address(this, ADDR_PAGE, ofs, 0, 0);
-               DoC_Command(this, NAND_CMD_ERASE2, 0);
-
-               DoC_Command(this, NAND_CMD_STATUS, CDSN_CTRL_WP);
-
-               if (DoC_is_Millennium(this)) {
-                       ReadDOC(docptr, ReadPipeInit);
-                       status = ReadDOC(docptr, LastDataRead);
-               } else {
-                       dummy = ReadDOC(docptr, CDSNSlowIO);
-                       DoC_Delay(this, 2);
-                       status = ReadDOC_(docptr, this->ioreg);
-               }
-
-               if (status & 1) {
-                       printk(KERN_ERR "Error erasing at 0x%x\n", ofs);
-                       /* There was an error */
-                       instr->state = MTD_ERASE_FAILED;
-                       goto callback;
-               }
-               ofs += mtd->erasesize;
-               len -= mtd->erasesize;
-       }
-       instr->state = MTD_ERASE_DONE;
-
- callback:
-       mtd_erase_callback(instr);
-
-       mutex_unlock(&this->lock);
-       return 0;
-}
-
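doc_erase() above rejects offsets and lengths that are not multiples of the erase block by masking with erasesize - 1, which works because erasesize is a power of two. A stand-alone sketch of that check; the sizes used below are illustrative, not real device geometry:

#include <stdio.h>

/* A non-zero result from (x & (erasesize - 1)) means x is not a
 * multiple of erasesize, for any power-of-two erasesize. */
static int erase_args_ok(unsigned int ofs, unsigned int len,
                         unsigned int erasesize)
{
        return !(ofs & (erasesize - 1)) && !(len & (erasesize - 1));
}

int main(void)
{
        printf("%d\n", erase_args_ok(0x4000, 0x8000, 0x4000)); /* 1: aligned */
        printf("%d\n", erase_args_ok(0x4100, 0x8000, 0x4000)); /* 0: bad offset */
        return 0;
}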
-
-/****************************************************************************
- *
- * Module stuff
- *
- ****************************************************************************/
-
-static void __exit cleanup_doc2000(void)
-{
-       struct mtd_info *mtd;
-       struct DiskOnChip *this;
-
-       while ((mtd = doc2klist)) {
-               this = mtd->priv;
-               doc2klist = this->nextdoc;
-
-               mtd_device_unregister(mtd);
-
-               iounmap(this->virtadr);
-               kfree(this->chips);
-               kfree(mtd);
-       }
-}
-
-module_exit(cleanup_doc2000);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("David Woodhouse <dwmw2@infradead.org> et al.");
-MODULE_DESCRIPTION("MTD driver for DiskOnChip 2000 and Millennium");
-
diff --git a/drivers/mtd/devices/doc2001.c b/drivers/mtd/devices/doc2001.c
deleted file mode 100644 (file)
index f692795..0000000
+++ /dev/null
@@ -1,824 +0,0 @@
-
-/*
- * Linux driver for Disk-On-Chip Millennium
- * (c) 1999 Machine Vision Holdings, Inc.
- * (c) 1999, 2000 David Woodhouse <dwmw2@infradead.org>
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <asm/errno.h>
-#include <asm/io.h>
-#include <asm/uaccess.h>
-#include <linux/delay.h>
-#include <linux/slab.h>
-#include <linux/init.h>
-#include <linux/types.h>
-#include <linux/bitops.h>
-
-#include <linux/mtd/mtd.h>
-#include <linux/mtd/nand.h>
-#include <linux/mtd/doc2000.h>
-
-/* #define ECC_DEBUG */
-
-/* I have no idea why some DoC chips can not use memcpy_from|to_io().
- * This may be due to the different revisions of the built-in ASIC controller or
- * simply a QA/bug issue. Who knows ?? If you have trouble, please uncomment
- * this: */
-#undef USE_MEMCPY
-
-static int doc_read(struct mtd_info *mtd, loff_t from, size_t len,
-                   size_t *retlen, u_char *buf);
-static int doc_write(struct mtd_info *mtd, loff_t to, size_t len,
-                    size_t *retlen, const u_char *buf);
-static int doc_read_oob(struct mtd_info *mtd, loff_t ofs,
-                       struct mtd_oob_ops *ops);
-static int doc_write_oob(struct mtd_info *mtd, loff_t ofs,
-                        struct mtd_oob_ops *ops);
-static int doc_erase (struct mtd_info *mtd, struct erase_info *instr);
-
-static struct mtd_info *docmillist = NULL;
-
-/* Perform the required delay cycles by reading from the NOP register */
-static void DoC_Delay(void __iomem * docptr, unsigned short cycles)
-{
-       volatile char dummy;
-       int i;
-
-       for (i = 0; i < cycles; i++)
-               dummy = ReadDOC(docptr, NOP);
-}
-
-/* DOC_WaitReady: Wait for RDY line to be asserted by the flash chip */
-static int _DoC_WaitReady(void __iomem * docptr)
-{
-       unsigned short c = 0xffff;
-
-       pr_debug("_DoC_WaitReady called for out-of-line wait\n");
-
-       /* Out-of-line routine to wait for chip response */
-       while (!(ReadDOC(docptr, CDSNControl) & CDSN_CTRL_FR_B) && --c)
-               ;
-
-       if (c == 0)
-               pr_debug("_DoC_WaitReady timed out.\n");
-
-       return (c == 0);
-}
-
-static inline int DoC_WaitReady(void __iomem * docptr)
-{
-       /* This is inline, to optimise the common case, where it's ready instantly */
-       int ret = 0;
-
-       /* 4 reads from the NOP register should be issued prior to the read from CDSNControl,
-          see Software Requirement 11.4 item 2. */
-       DoC_Delay(docptr, 4);
-
-       if (!(ReadDOC(docptr, CDSNControl) & CDSN_CTRL_FR_B))
-               /* Call the out-of-line routine to wait */
-               ret = _DoC_WaitReady(docptr);
-
-       /* issue 2 reads from the NOP register after reading from the CDSNControl register,
-          see Software Requirement 11.4 item 2. */
-       DoC_Delay(docptr, 2);
-
-       return ret;
-}
-
-/* DoC_Command: Send a flash command to the flash chip through the CDSN IO register
-   with the internal pipeline. 4 delay cycles (reads from the NOP register) are
-   required after writing to the CDSN Control register, see Software Requirement 11.4 item 3. */
-
-static void DoC_Command(void __iomem * docptr, unsigned char command,
-                              unsigned char xtraflags)
-{
-       /* Assert the CLE (Command Latch Enable) line to the flash chip */
-       WriteDOC(xtraflags | CDSN_CTRL_CLE | CDSN_CTRL_CE, docptr, CDSNControl);
-       DoC_Delay(docptr, 4);
-
-       /* Send the command */
-       WriteDOC(command, docptr, Mil_CDSN_IO);
-       WriteDOC(0x00, docptr, WritePipeTerm);
-
-       /* Lower the CLE line */
-       WriteDOC(xtraflags | CDSN_CTRL_CE, docptr, CDSNControl);
-       DoC_Delay(docptr, 4);
-}
-
-/* DoC_Address: Set the current address for the flash chip through the CDSN IO register
-   with the internal pipeline. 4 delay cycles (reads from the NOP register) are
-   required after writing to the CDSN Control register, see Software Requirement 11.4 item 3. */
-
-static inline void DoC_Address(void __iomem * docptr, int numbytes, unsigned long ofs,
-                              unsigned char xtraflags1, unsigned char xtraflags2)
-{
-       /* Assert the ALE (Address Latch Enable) line to the flash chip */
-       WriteDOC(xtraflags1 | CDSN_CTRL_ALE | CDSN_CTRL_CE, docptr, CDSNControl);
-       DoC_Delay(docptr, 4);
-
-       /* Send the address */
-       switch (numbytes)
-           {
-           case 1:
-                   /* Send single byte, bits 0-7. */
-                   WriteDOC(ofs & 0xff, docptr, Mil_CDSN_IO);
-                   WriteDOC(0x00, docptr, WritePipeTerm);
-                   break;
-           case 2:
-                   /* Send bits 9-16 followed by 17-23 */
-                   WriteDOC((ofs >> 9)  & 0xff, docptr, Mil_CDSN_IO);
-                   WriteDOC((ofs >> 17) & 0xff, docptr, Mil_CDSN_IO);
-                   WriteDOC(0x00, docptr, WritePipeTerm);
-               break;
-           case 3:
-                   /* Send 0-7, 9-16, then 17-23 */
-                   WriteDOC(ofs & 0xff, docptr, Mil_CDSN_IO);
-                   WriteDOC((ofs >> 9)  & 0xff, docptr, Mil_CDSN_IO);
-                   WriteDOC((ofs >> 17) & 0xff, docptr, Mil_CDSN_IO);
-                   WriteDOC(0x00, docptr, WritePipeTerm);
-               break;
-           default:
-               return;
-           }
-
-       /* Lower the ALE line */
-       WriteDOC(xtraflags1 | xtraflags2 | CDSN_CTRL_CE, docptr, CDSNControl);
-       DoC_Delay(docptr, 4);
-}
-
-/* DoC_SelectChip: Select a given flash chip within the current floor */
-static int DoC_SelectChip(void __iomem * docptr, int chip)
-{
-       /* Select the individual flash chip requested */
-       WriteDOC(chip, docptr, CDSNDeviceSelect);
-       DoC_Delay(docptr, 4);
-
-       /* Wait for it to be ready */
-       return DoC_WaitReady(docptr);
-}
-
-/* DoC_SelectFloor: Select a given floor (bank of flash chips) */
-static int DoC_SelectFloor(void __iomem * docptr, int floor)
-{
-       /* Select the floor (bank) of chips required */
-       WriteDOC(floor, docptr, FloorSelect);
-
-       /* Wait for the chip to be ready */
-       return DoC_WaitReady(docptr);
-}
-
-/* DoC_IdentChip: Identify a given NAND chip given {floor,chip} */
-static int DoC_IdentChip(struct DiskOnChip *doc, int floor, int chip)
-{
-       int mfr, id, i, j;
-       volatile char dummy;
-
-       /* Page in the required floor/chip
-          FIXME: is this supported by Millennium ?? */
-       DoC_SelectFloor(doc->virtadr, floor);
-       DoC_SelectChip(doc->virtadr, chip);
-
-       /* Reset the chip, see Software Requirement 11.4 item 1. */
-       DoC_Command(doc->virtadr, NAND_CMD_RESET, CDSN_CTRL_WP);
-       DoC_WaitReady(doc->virtadr);
-
-       /* Read the NAND chip ID: 1. Send ReadID command */
-       DoC_Command(doc->virtadr, NAND_CMD_READID, CDSN_CTRL_WP);
-
-       /* Read the NAND chip ID: 2. Send address byte zero */
-       DoC_Address(doc->virtadr, 1, 0x00, CDSN_CTRL_WP, 0x00);
-
-       /* Read the manufacturer and device id codes of the flash device through
-          CDSN IO register see Software Requirement 11.4 item 5.*/
-       dummy = ReadDOC(doc->virtadr, ReadPipeInit);
-       DoC_Delay(doc->virtadr, 2);
-       mfr = ReadDOC(doc->virtadr, Mil_CDSN_IO);
-
-       DoC_Delay(doc->virtadr, 2);
-       id  = ReadDOC(doc->virtadr, Mil_CDSN_IO);
-       dummy = ReadDOC(doc->virtadr, LastDataRead);
-
-       /* No response - return failure */
-       if (mfr == 0xff || mfr == 0)
-               return 0;
-
-       /* FIXME: to deal with multi-flash on multi-Millennium case more carefully */
-       for (i = 0; nand_flash_ids[i].name != NULL; i++) {
-               if ( id == nand_flash_ids[i].id) {
-                       /* Try to identify manufacturer */
-                       for (j = 0; nand_manuf_ids[j].id != 0x0; j++) {
-                               if (nand_manuf_ids[j].id == mfr)
-                                       break;
-                       }
-                       printk(KERN_INFO "Flash chip found: Manufacturer ID: %2.2X, "
-                              "Chip ID: %2.2X (%s:%s)\n",
-                              mfr, id, nand_manuf_ids[j].name, nand_flash_ids[i].name);
-                       doc->mfr = mfr;
-                       doc->id = id;
-                       doc->chipshift = ffs((nand_flash_ids[i].chipsize << 20)) - 1;
-                       break;
-               }
-       }
-
-       if (nand_flash_ids[i].name == NULL)
-               return 0;
-       else
-               return 1;
-}
-
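DoC_IdentChip() above derives chipshift from the ID table's chip size given in MiB: shifting by 20 converts it to bytes, and for a power-of-two size ffs() - 1 is its base-2 logarithm. A stand-alone sketch of that arithmetic; the 32 MiB figure is illustrative, not taken from the real nand_flash_ids table:

#include <stdio.h>
#include <strings.h>    /* ffs() */

int main(void)
{
        unsigned long chipsize_mib = 32;              /* hypothetical 32 MiB chip */
        unsigned long bytes = chipsize_mib << 20;     /* 0x2000000 */
        int chipshift = ffs((int)bytes) - 1;          /* 25 for 32 MiB */

        printf("size=%lu bytes, chipshift=%d, 1<<chipshift=%lu\n",
               bytes, chipshift, 1UL << chipshift);
        return 0;
}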
-/* DoC_ScanChips: Find all NAND chips present in a DiskOnChip, and identify them */
-static void DoC_ScanChips(struct DiskOnChip *this)
-{
-       int floor, chip;
-       int numchips[MAX_FLOORS_MIL];
-       int ret;
-
-       this->numchips = 0;
-       this->mfr = 0;
-       this->id = 0;
-
-       /* For each floor, find the number of valid chips it contains */
-       for (floor = 0,ret = 1; floor < MAX_FLOORS_MIL; floor++) {
-               numchips[floor] = 0;
-               for (chip = 0; chip < MAX_CHIPS_MIL && ret != 0; chip++) {
-                       ret = DoC_IdentChip(this, floor, chip);
-                       if (ret) {
-                               numchips[floor]++;
-                               this->numchips++;
-                       }
-               }
-       }
-       /* If there are none at all that we recognise, bail */
-       if (!this->numchips) {
-               printk("No flash chips recognised.\n");
-               return;
-       }
-
-       /* Allocate an array to hold the information for each chip */
-       this->chips = kmalloc(sizeof(struct Nand) * this->numchips, GFP_KERNEL);
-       if (!this->chips){
-               printk("No memory for allocating chip info structures\n");
-               return;
-       }
-
-       /* Fill out the chip array with {floor, chipno} for each
-        * detected chip in the device. */
-       for (floor = 0, ret = 0; floor < MAX_FLOORS_MIL; floor++) {
-               for (chip = 0 ; chip < numchips[floor] ; chip++) {
-                       this->chips[ret].floor = floor;
-                       this->chips[ret].chip = chip;
-                       this->chips[ret].curadr = 0;
-                       this->chips[ret].curmode = 0x50;
-                       ret++;
-               }
-       }
-
-       /* Calculate and print the total size of the device */
-       this->totlen = this->numchips * (1 << this->chipshift);
-       printk(KERN_INFO "%d flash chips found. Total DiskOnChip size: %ld MiB\n",
-              this->numchips, this->totlen >> 20);
-}
-
-static int DoCMil_is_alias(struct DiskOnChip *doc1, struct DiskOnChip *doc2)
-{
-       int tmp1, tmp2, retval;
-
-       if (doc1->physadr == doc2->physadr)
-               return 1;
-
-       /* Use the alias resolution register which was set aside for this
-        * purpose. If its value is the same on both chips, they might
-        * be the same chip, and we write to one and check for a change in
-        * the other. It's unclear if this register is usable in the
-        * DoC 2000 (it's in the Millennium docs), but it seems to work. */
-       tmp1 = ReadDOC(doc1->virtadr, AliasResolution);
-       tmp2 = ReadDOC(doc2->virtadr, AliasResolution);
-       if (tmp1 != tmp2)
-               return 0;
-
-       WriteDOC((tmp1+1) % 0xff, doc1->virtadr, AliasResolution);
-       tmp2 = ReadDOC(doc2->virtadr, AliasResolution);
-       if (tmp2 == (tmp1+1) % 0xff)
-               retval = 1;
-       else
-               retval = 0;
-
-       /* Restore register contents.  May not be necessary, but do it just to
-        * be safe. */
-       WriteDOC(tmp1, doc1->virtadr, AliasResolution);
-
-       return retval;
-}
-
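DoCMil_is_alias() above decides whether two probed windows map the same device by writing a changed value through one window and checking whether it shows up through the other. A user-space sketch of the same write-and-check idea, with plain memory standing in for the AliasResolution register:

#include <stdio.h>

static int is_alias(unsigned char *reg1, unsigned char *reg2)
{
        unsigned char tmp1 = *reg1, tmp2 = *reg2;
        int ret;

        if (tmp1 != tmp2)
                return 0;                       /* values differ: separate devices */

        *reg1 = (tmp1 + 1) % 0xff;              /* poke one mapping ... */
        ret = (*reg2 == (tmp1 + 1) % 0xff);     /* ... did the other follow? */

        *reg1 = tmp1;                           /* restore the original value */
        return ret;
}

int main(void)
{
        unsigned char shared = 0x42, other = 0x42;

        printf("same device: %d\n", is_alias(&shared, &shared)); /* 1 */
        printf("two devices: %d\n", is_alias(&shared, &other));  /* 0 */
        return 0;
}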
-/* This routine is found from the docprobe code by symbol_get(),
- * which will bump the use count of this module. */
-void DoCMil_init(struct mtd_info *mtd)
-{
-       struct DiskOnChip *this = mtd->priv;
-       struct DiskOnChip *old = NULL;
-
-       /* We must avoid being called twice for the same device. */
-       if (docmillist)
-               old = docmillist->priv;
-
-       while (old) {
-               if (DoCMil_is_alias(this, old)) {
-                       printk(KERN_NOTICE "Ignoring DiskOnChip Millennium at "
-                              "0x%lX - already configured\n", this->physadr);
-                       iounmap(this->virtadr);
-                       kfree(mtd);
-                       return;
-               }
-               if (old->nextdoc)
-                       old = old->nextdoc->priv;
-               else
-                       old = NULL;
-       }
-
-       mtd->name = "DiskOnChip Millennium";
-       printk(KERN_NOTICE "DiskOnChip Millennium found at address 0x%lX\n",
-              this->physadr);
-
-       mtd->type = MTD_NANDFLASH;
-       mtd->flags = MTD_CAP_NANDFLASH;
-
-       /* FIXME: erase size is not always 8KiB */
-       mtd->erasesize = 0x2000;
-       mtd->writebufsize = mtd->writesize = 512;
-       mtd->oobsize = 16;
-       mtd->ecc_strength = 2;
-       mtd->owner = THIS_MODULE;
-       mtd->_erase = doc_erase;
-       mtd->_read = doc_read;
-       mtd->_write = doc_write;
-       mtd->_read_oob = doc_read_oob;
-       mtd->_write_oob = doc_write_oob;
-       this->curfloor = -1;
-       this->curchip = -1;
-
-       /* Ident all the chips present. */
-       DoC_ScanChips(this);
-
-       if (!this->totlen) {
-               kfree(mtd);
-               iounmap(this->virtadr);
-       } else {
-               this->nextdoc = docmillist;
-               docmillist = mtd;
-               mtd->size  = this->totlen;
-               mtd_device_register(mtd, NULL, 0);
-               return;
-       }
-}
-EXPORT_SYMBOL_GPL(DoCMil_init);
-
-static int doc_read (struct mtd_info *mtd, loff_t from, size_t len,
-                    size_t *retlen, u_char *buf)
-{
-       int i, ret;
-       volatile char dummy;
-       unsigned char syndrome[6], eccbuf[6];
-       struct DiskOnChip *this = mtd->priv;
-       void __iomem *docptr = this->virtadr;
-       struct Nand *mychip = &this->chips[from >> (this->chipshift)];
-
-       /* Don't allow a single read to cross a 512-byte block boundary */
-       if (from + len > ((from | 0x1ff) + 1))
-               len = ((from | 0x1ff) + 1) - from;
-
-       /* Find the chip which is to be used and select it */
-       if (this->curfloor != mychip->floor) {
-               DoC_SelectFloor(docptr, mychip->floor);
-               DoC_SelectChip(docptr, mychip->chip);
-       } else if (this->curchip != mychip->chip) {
-               DoC_SelectChip(docptr, mychip->chip);
-       }
-       this->curfloor = mychip->floor;
-       this->curchip = mychip->chip;
-
-       /* issue the Read0 or Read1 command depending on which half of the page
-          we are accessing. Poll the Flash Ready bit after issuing the 3 address
-          bytes in Sequence Read Mode, see Software Requirement 11.4 item 1. */
-       DoC_Command(docptr, (from >> 8) & 1, CDSN_CTRL_WP);
-       DoC_Address(docptr, 3, from, CDSN_CTRL_WP, 0x00);
-       DoC_WaitReady(docptr);
-
-       /* init the ECC engine, see Reed-Solomon EDC/ECC 11.1 .*/
-       WriteDOC (DOC_ECC_RESET, docptr, ECCConf);
-       WriteDOC (DOC_ECC_EN, docptr, ECCConf);
-
-       /* Read the data via the internal pipeline through CDSN IO register,
-          see Pipelined Read Operations 11.3 */
-       dummy = ReadDOC(docptr, ReadPipeInit);
-#ifndef USE_MEMCPY
-       for (i = 0; i < len-1; i++) {
-               /* N.B. you have to increase the source address in this way or the
-                  ECC logic will not work properly */
-               buf[i] = ReadDOC(docptr, Mil_CDSN_IO + (i & 0xff));
-       }
-#else
-       memcpy_fromio(buf, docptr + DoC_Mil_CDSN_IO, len - 1);
-#endif
-       buf[len - 1] = ReadDOC(docptr, LastDataRead);
-
-       /* Let the caller know we completed it */
-       *retlen = len;
-        ret = 0;
-
-       /* Read the ECC data from Spare Data Area,
-          see Reed-Solomon EDC/ECC 11.1 */
-       dummy = ReadDOC(docptr, ReadPipeInit);
-#ifndef USE_MEMCPY
-       for (i = 0; i < 5; i++) {
-               /* N.B. you have to increase the source address in this way or the
-                  ECC logic will not work properly */
-               eccbuf[i] = ReadDOC(docptr, Mil_CDSN_IO + i);
-       }
-#else
-       memcpy_fromio(eccbuf, docptr + DoC_Mil_CDSN_IO, 5);
-#endif
-       eccbuf[5] = ReadDOC(docptr, LastDataRead);
-
-       /* Flush the pipeline */
-       dummy = ReadDOC(docptr, ECCConf);
-       dummy = ReadDOC(docptr, ECCConf);
-
-       /* Check the ECC Status */
-       if (ReadDOC(docptr, ECCConf) & 0x80) {
-               int nb_errors;
-               /* There was an ECC error */
-#ifdef ECC_DEBUG
-               printk("DiskOnChip ECC Error: Read at %lx\n", (long)from);
-#endif
-               /* Read the ECC syndrome through the DiskOnChip ECC logic.
-                  These syndrome will be all ZERO when there is no error */
-               for (i = 0; i < 6; i++) {
-                       syndrome[i] = ReadDOC(docptr, ECCSyndrome0 + i);
-               }
-               nb_errors = doc_decode_ecc(buf, syndrome);
-#ifdef ECC_DEBUG
-               printk("ECC Errors corrected: %x\n", nb_errors);
-#endif
-               if (nb_errors < 0) {
-                       /* We return error, but have actually done the read. Not that
-                          this can be told to user-space, via sys_read(), but at least
-                          MTD-aware stuff can know about it by checking *retlen */
-                       ret = -EIO;
-               }
-       }
-
-#ifdef PSYCHO_DEBUG
-       printk("ECC DATA at %lx: %2.2X %2.2X %2.2X %2.2X %2.2X %2.2X\n",
-              (long)from, eccbuf[0], eccbuf[1], eccbuf[2], eccbuf[3],
-              eccbuf[4], eccbuf[5]);
-#endif
-
-       /* disable the ECC engine */
-       WriteDOC(DOC_ECC_DIS, docptr , ECCConf);
-
-       return ret;
-}
-
-static int doc_write (struct mtd_info *mtd, loff_t to, size_t len,
-                     size_t *retlen, const u_char *buf)
-{
-       int i,ret = 0;
-       char eccbuf[6];
-       volatile char dummy;
-       struct DiskOnChip *this = mtd->priv;
-       void __iomem *docptr = this->virtadr;
-       struct Nand *mychip = &this->chips[to >> (this->chipshift)];
-
-#if 0
-       /* Don't allow a single write to cross a 512-byte block boundary */
-       if (to + len > ( (to | 0x1ff) + 1))
-               len = ((to | 0x1ff) + 1) - to;
-#else
-       /* Don't allow writes which aren't exactly one block */
-       if (to & 0x1ff || len != 0x200)
-               return -EINVAL;
-#endif
-
-       /* Find the chip which is to be used and select it */
-       if (this->curfloor != mychip->floor) {
-               DoC_SelectFloor(docptr, mychip->floor);
-               DoC_SelectChip(docptr, mychip->chip);
-       } else if (this->curchip != mychip->chip) {
-               DoC_SelectChip(docptr, mychip->chip);
-       }
-       this->curfloor = mychip->floor;
-       this->curchip = mychip->chip;
-
-       /* Reset the chip, see Software Requirement 11.4 item 1. */
-       DoC_Command(docptr, NAND_CMD_RESET, 0x00);
-       DoC_WaitReady(docptr);
-       /* Set device to main plane of flash */
-       DoC_Command(docptr, NAND_CMD_READ0, 0x00);
-
-       /* issue the Serial Data In command to initiate the Page Program process */
-       DoC_Command(docptr, NAND_CMD_SEQIN, 0x00);
-       DoC_Address(docptr, 3, to, 0x00, 0x00);
-       DoC_WaitReady(docptr);
-
-       /* init the ECC engine, see Reed-Solomon EDC/ECC 11.1 .*/
-       WriteDOC (DOC_ECC_RESET, docptr, ECCConf);
-       WriteDOC (DOC_ECC_EN | DOC_ECC_RW, docptr, ECCConf);
-
-       /* Write the data via the internal pipeline through CDSN IO register,
-          see Pipelined Write Operations 11.2 */
-#ifndef USE_MEMCPY
-       for (i = 0; i < len; i++) {
-               /* N.B. you have to increase the source address in this way or the
-                  ECC logic will not work properly */
-               WriteDOC(buf[i], docptr, Mil_CDSN_IO + i);
-       }
-#else
-       memcpy_toio(docptr + DoC_Mil_CDSN_IO, buf, len);
-#endif
-       WriteDOC(0x00, docptr, WritePipeTerm);
-
-       /* Write ECC data to flash, the ECC info is generated by the DiskOnChip ECC logic
-          see Reed-Solomon EDC/ECC 11.1 */
-       WriteDOC(0, docptr, NOP);
-       WriteDOC(0, docptr, NOP);
-       WriteDOC(0, docptr, NOP);
-
-       /* Read the ECC data through the DiskOnChip ECC logic */
-       for (i = 0; i < 6; i++) {
-               eccbuf[i] = ReadDOC(docptr, ECCSyndrome0 + i);
-       }
-
-       /* ignore the ECC engine */
-       WriteDOC(DOC_ECC_DIS, docptr , ECCConf);
-
-#ifndef USE_MEMCPY
-       /* Write the ECC data to flash */
-       for (i = 0; i < 6; i++) {
-               /* N.B. you have to increase the source address in this way or the
-                  ECC logic will not work properly */
-               WriteDOC(eccbuf[i], docptr, Mil_CDSN_IO + i);
-       }
-#else
-       memcpy_toio(docptr + DoC_Mil_CDSN_IO, eccbuf, 6);
-#endif
-
-       /* write the block status BLOCK_USED (0x5555) at the end of the ECC data.
-          FIXME: this is only a hack for programming the IPL area for LinuxBIOS
-          and should be replaced with proper code in user-space utilities */
-       WriteDOC(0x55, docptr, Mil_CDSN_IO);
-       WriteDOC(0x55, docptr, Mil_CDSN_IO + 1);
-
-       WriteDOC(0x00, docptr, WritePipeTerm);
-
-#ifdef PSYCHO_DEBUG
-       printk("OOB data at %lx is %2.2X %2.2X %2.2X %2.2X %2.2X %2.2X\n",
-              (long) to, eccbuf[0], eccbuf[1], eccbuf[2], eccbuf[3],
-              eccbuf[4], eccbuf[5]);
-#endif
-
-       /* Commit the Page Program command and wait for ready
-          see Software Requirement 11.4 item 1.*/
-       DoC_Command(docptr, NAND_CMD_PAGEPROG, 0x00);
-       DoC_WaitReady(docptr);
-
-       /* Read the status of the flash device through CDSN IO register
-          see Software Requirement 11.4 item 5.*/
-       DoC_Command(docptr, NAND_CMD_STATUS, CDSN_CTRL_WP);
-       dummy = ReadDOC(docptr, ReadPipeInit);
-       DoC_Delay(docptr, 2);
-       if (ReadDOC(docptr, Mil_CDSN_IO) & 1) {
-               printk("Error programming flash\n");
-               /* Error in programming
-                  FIXME: implement Bad Block Replacement (in nftl.c ??) */
-               ret = -EIO;
-       }
-       dummy = ReadDOC(docptr, LastDataRead);
-
-       /* Let the caller know we completed it */
-       *retlen = len;
-
-       return ret;
-}
-
-static int doc_read_oob(struct mtd_info *mtd, loff_t ofs,
-                       struct mtd_oob_ops *ops)
-{
-#ifndef USE_MEMCPY
-       int i;
-#endif
-       volatile char dummy;
-       struct DiskOnChip *this = mtd->priv;
-       void __iomem *docptr = this->virtadr;
-       struct Nand *mychip = &this->chips[ofs >> this->chipshift];
-       uint8_t *buf = ops->oobbuf;
-       size_t len = ops->len;
-
-       BUG_ON(ops->mode != MTD_OPS_PLACE_OOB);
-
-       ofs += ops->ooboffs;
-
-       /* Find the chip which is to be used and select it */
-       if (this->curfloor != mychip->floor) {
-               DoC_SelectFloor(docptr, mychip->floor);
-               DoC_SelectChip(docptr, mychip->chip);
-       } else if (this->curchip != mychip->chip) {
-               DoC_SelectChip(docptr, mychip->chip);
-       }
-       this->curfloor = mychip->floor;
-       this->curchip = mychip->chip;
-
-       /* disable the ECC engine */
-       WriteDOC (DOC_ECC_RESET, docptr, ECCConf);
-       WriteDOC (DOC_ECC_DIS, docptr, ECCConf);
-
-       /* issue the Read2 command to set the pointer to the Spare Data Area.
-          Poll the Flash Ready bit after issuing the 3 address bytes in
-          Sequence Read Mode, see Software Requirement 11.4 item 1. */
-       DoC_Command(docptr, NAND_CMD_READOOB, CDSN_CTRL_WP);
-       DoC_Address(docptr, 3, ofs, CDSN_CTRL_WP, 0x00);
-       DoC_WaitReady(docptr);
-
-       /* Read the data out via the internal pipeline through CDSN IO register,
-          see Pipelined Read Operations 11.3 */
-       dummy = ReadDOC(docptr, ReadPipeInit);
-#ifndef USE_MEMCPY
-       for (i = 0; i < len-1; i++) {
-               /* N.B. you have to increase the source address in this way or the
-                  ECC logic will not work properly */
-               buf[i] = ReadDOC(docptr, Mil_CDSN_IO + i);
-       }
-#else
-       memcpy_fromio(buf, docptr + DoC_Mil_CDSN_IO, len - 1);
-#endif
-       buf[len - 1] = ReadDOC(docptr, LastDataRead);
-
-       ops->retlen = len;
-
-       return 0;
-}
-
-static int doc_write_oob(struct mtd_info *mtd, loff_t ofs,
-                        struct mtd_oob_ops *ops)
-{
-#ifndef USE_MEMCPY
-       int i;
-#endif
-       volatile char dummy;
-       int ret = 0;
-       struct DiskOnChip *this = mtd->priv;
-       void __iomem *docptr = this->virtadr;
-       struct Nand *mychip = &this->chips[ofs >> this->chipshift];
-       uint8_t *buf = ops->oobbuf;
-       size_t len = ops->len;
-
-       BUG_ON(ops->mode != MTD_OPS_PLACE_OOB);
-
-       ofs += ops->ooboffs;
-
-       /* Find the chip which is to be used and select it */
-       if (this->curfloor != mychip->floor) {
-               DoC_SelectFloor(docptr, mychip->floor);
-               DoC_SelectChip(docptr, mychip->chip);
-       } else if (this->curchip != mychip->chip) {
-               DoC_SelectChip(docptr, mychip->chip);
-       }
-       this->curfloor = mychip->floor;
-       this->curchip = mychip->chip;
-
-       /* disable the ECC engine */
-       WriteDOC (DOC_ECC_RESET, docptr, ECCConf);
-       WriteDOC (DOC_ECC_DIS, docptr, ECCConf);
-
-       /* Reset the chip, see Software Requirement 11.4 item 1. */
-       DoC_Command(docptr, NAND_CMD_RESET, CDSN_CTRL_WP);
-       DoC_WaitReady(docptr);
-       /* issue the Read2 command to set the pointer to the Spare Data Area. */
-       DoC_Command(docptr, NAND_CMD_READOOB, CDSN_CTRL_WP);
-
-       /* issue the Serial Data In command to initiate the Page Program process */
-       DoC_Command(docptr, NAND_CMD_SEQIN, 0x00);
-       DoC_Address(docptr, 3, ofs, 0x00, 0x00);
-
-       /* Write the data via the internal pipeline through CDSN IO register,
-          see Pipelined Write Operations 11.2 */
-#ifndef USE_MEMCPY
-       for (i = 0; i < len; i++) {
-               /* N.B. you have to increase the source address in this way or the
-                  ECC logic will not work properly */
-               WriteDOC(buf[i], docptr, Mil_CDSN_IO + i);
-       }
-#else
-       memcpy_toio(docptr + DoC_Mil_CDSN_IO, buf, len);
-#endif
-       WriteDOC(0x00, docptr, WritePipeTerm);
-
-       /* Commit the Page Program command and wait for ready
-          see Software Requirement 11.4 item 1.*/
-       DoC_Command(docptr, NAND_CMD_PAGEPROG, 0x00);
-       DoC_WaitReady(docptr);
-
-       /* Read the status of the flash device through CDSN IO register
-          see Software Requirement 11.4 item 5.*/
-       DoC_Command(docptr, NAND_CMD_STATUS, 0x00);
-       dummy = ReadDOC(docptr, ReadPipeInit);
-       DoC_Delay(docptr, 2);
-       if (ReadDOC(docptr, Mil_CDSN_IO) & 1) {
-               printk("Error programming oob data\n");
-               /* FIXME: implement Bad Block Replacement (in nftl.c ??) */
-               ops->retlen = 0;
-               ret = -EIO;
-       }
-       dummy = ReadDOC(docptr, LastDataRead);
-
-       ops->retlen = len;
-
-       return ret;
-}
-
-int doc_erase (struct mtd_info *mtd, struct erase_info *instr)
-{
-       volatile char dummy;
-       struct DiskOnChip *this = mtd->priv;
-       __u32 ofs = instr->addr;
-       __u32 len = instr->len;
-       void __iomem *docptr = this->virtadr;
-       struct Nand *mychip = &this->chips[ofs >> this->chipshift];
-
-       if (len != mtd->erasesize)
-               printk(KERN_WARNING "Erase not right size (%x != %x)\n",
-                      len, mtd->erasesize);
-
-       /* Find the chip which is to be used and select it */
-       if (this->curfloor != mychip->floor) {
-               DoC_SelectFloor(docptr, mychip->floor);
-               DoC_SelectChip(docptr, mychip->chip);
-       } else if (this->curchip != mychip->chip) {
-               DoC_SelectChip(docptr, mychip->chip);
-       }
-       this->curfloor = mychip->floor;
-       this->curchip = mychip->chip;
-
-       instr->state = MTD_ERASE_PENDING;
-
-       /* issue the Erase Setup command */
-       DoC_Command(docptr, NAND_CMD_ERASE1, 0x00);
-       DoC_Address(docptr, 2, ofs, 0x00, 0x00);
-
-       /* Commit the Erase Start command and wait for ready
-          see Software Requirement 11.4 item 1.*/
-       DoC_Command(docptr, NAND_CMD_ERASE2, 0x00);
-       DoC_WaitReady(docptr);
-
-       instr->state = MTD_ERASING;
-
-       /* Read the status of the flash device through CDSN IO register
-          see Software Requirement 11.4 item 5.
-          FIXME: it seems that we are not waiting long enough; some blocks are
-          not fully erased */
-       DoC_Command(docptr, NAND_CMD_STATUS, CDSN_CTRL_WP);
-       dummy = ReadDOC(docptr, ReadPipeInit);
-       DoC_Delay(docptr, 2);
-       if (ReadDOC(docptr, Mil_CDSN_IO) & 1) {
-               printk("Error Erasing at 0x%x\n", ofs);
-               /* There was an error
-                  FIXME: implement Bad Block Replacement (in nftl.c ??) */
-               instr->state = MTD_ERASE_FAILED;
-       } else
-               instr->state = MTD_ERASE_DONE;
-       dummy = ReadDOC(docptr, LastDataRead);
-
-       mtd_erase_callback(instr);
-
-       return 0;
-}
-
-/****************************************************************************
- *
- * Module stuff
- *
- ****************************************************************************/
-
-static void __exit cleanup_doc2001(void)
-{
-       struct mtd_info *mtd;
-       struct DiskOnChip *this;
-
-       while ((mtd=docmillist)) {
-               this = mtd->priv;
-               docmillist = this->nextdoc;
-
-               mtd_device_unregister(mtd);
-
-               iounmap(this->virtadr);
-               kfree(this->chips);
-               kfree(mtd);
-       }
-}
-
-module_exit(cleanup_doc2001);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("David Woodhouse <dwmw2@infradead.org> et al.");
-MODULE_DESCRIPTION("Alternative driver for DiskOnChip Millennium");
diff --git a/drivers/mtd/devices/doc2001plus.c b/drivers/mtd/devices/doc2001plus.c
deleted file mode 100644 (file)
index 4f2220a..0000000
+++ /dev/null
@@ -1,1080 +0,0 @@
-/*
- * Linux driver for Disk-On-Chip Millennium Plus
- *
- * (c) 2002-2003 Greg Ungerer <gerg@snapgear.com>
- * (c) 2002-2003 SnapGear Inc
- * (c) 1999 Machine Vision Holdings, Inc.
- * (c) 1999, 2000 David Woodhouse <dwmw2@infradead.org>
- *
- * Released under GPL
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <asm/errno.h>
-#include <asm/io.h>
-#include <asm/uaccess.h>
-#include <linux/delay.h>
-#include <linux/slab.h>
-#include <linux/init.h>
-#include <linux/types.h>
-#include <linux/bitops.h>
-
-#include <linux/mtd/mtd.h>
-#include <linux/mtd/nand.h>
-#include <linux/mtd/doc2000.h>
-
-/* #define ECC_DEBUG */
-
-/* I have no idea why some DoC chips can not use memcpy_from|to_io().
- * This may be due to the different revisions of the built-in ASIC controller or
- * simply a QA/bug issue. Who knows ?? If you have trouble, please uncomment
- * this: */
-#undef USE_MEMCPY
-
-static int doc_read(struct mtd_info *mtd, loff_t from, size_t len,
-               size_t *retlen, u_char *buf);
-static int doc_write(struct mtd_info *mtd, loff_t to, size_t len,
-               size_t *retlen, const u_char *buf);
-static int doc_read_oob(struct mtd_info *mtd, loff_t ofs,
-                       struct mtd_oob_ops *ops);
-static int doc_write_oob(struct mtd_info *mtd, loff_t ofs,
-                        struct mtd_oob_ops *ops);
-static int doc_erase (struct mtd_info *mtd, struct erase_info *instr);
-
-static struct mtd_info *docmilpluslist = NULL;
-
-
-/* Perform the required delay cycles by writing to the NOP register */
-static void DoC_Delay(void __iomem * docptr, int cycles)
-{
-       int i;
-
-       for (i = 0; (i < cycles); i++)
-               WriteDOC(0, docptr, Mplus_NOP);
-}
-
-#define        CDSN_CTRL_FR_B_MASK     (CDSN_CTRL_FR_B0 | CDSN_CTRL_FR_B1)
-
-/* DOC_WaitReady: Wait for RDY line to be asserted by the flash chip */
-static int _DoC_WaitReady(void __iomem * docptr)
-{
-       unsigned int c = 0xffff;
-
-       pr_debug("_DoC_WaitReady called for out-of-line wait\n");
-
-       /* Out-of-line routine to wait for chip response */
-       while (((ReadDOC(docptr, Mplus_FlashControl) & CDSN_CTRL_FR_B_MASK) != CDSN_CTRL_FR_B_MASK) && --c)
-               ;
-
-       if (c == 0)
-               pr_debug("_DoC_WaitReady timed out.\n");
-
-       return (c == 0);
-}
-
-static inline int DoC_WaitReady(void __iomem * docptr)
-{
-       /* This is inline, to optimise the common case, where it's ready instantly */
-       int ret = 0;
-
-       /* reads from the NOP register should be issued prior to the read from CDSNControl,
-          see Software Requirement 11.4 item 2. */
-       DoC_Delay(docptr, 4);
-
-       if ((ReadDOC(docptr, Mplus_FlashControl) & CDSN_CTRL_FR_B_MASK) != CDSN_CTRL_FR_B_MASK)
-               /* Call the out-of-line routine to wait */
-               ret = _DoC_WaitReady(docptr);
-
-       return ret;
-}
-
-/* For some reason the Millennium Plus seems to occasionally put itself
- * into reset mode. For me this happens randomly, with no pattern that I
- * can detect. M-Systems suggest always checking this on any block-level
- * operation and setting it back to normal mode if it is in reset mode.
- */
-static inline void DoC_CheckASIC(void __iomem * docptr)
-{
-       /* Make sure the DoC is in normal mode */
-       if ((ReadDOC(docptr, Mplus_DOCControl) & DOC_MODE_NORMAL) == 0) {
-               WriteDOC((DOC_MODE_NORMAL | DOC_MODE_MDWREN), docptr, Mplus_DOCControl);
-               WriteDOC(~(DOC_MODE_NORMAL | DOC_MODE_MDWREN), docptr, Mplus_CtrlConfirm);
-       }
-}
-
-/* DoC_Command: Send a flash command to the flash chip through the Flash
- * command register. Need 2 Write Pipeline Terminates to complete send.
- */
-static void DoC_Command(void __iomem * docptr, unsigned char command,
-                              unsigned char xtraflags)
-{
-       WriteDOC(command, docptr, Mplus_FlashCmd);
-       WriteDOC(command, docptr, Mplus_WritePipeTerm);
-       WriteDOC(command, docptr, Mplus_WritePipeTerm);
-}
-
-/* DoC_Address: Set the current address for the flash chip through the Flash
- * Address register. Need 2 Write Pipeline Terminates to complete send.
- */
-static inline void DoC_Address(struct DiskOnChip *doc, int numbytes,
-                              unsigned long ofs, unsigned char xtraflags1,
-                              unsigned char xtraflags2)
-{
-       void __iomem * docptr = doc->virtadr;
-
-       /* Allow for possible Mill Plus internal flash interleaving */
-       ofs >>= doc->interleave;
-
-       switch (numbytes) {
-       case 1:
-               /* Send single byte, bits 0-7. */
-               WriteDOC(ofs & 0xff, docptr, Mplus_FlashAddress);
-               break;
-       case 2:
-               /* Send bits 9-16 followed by 17-23 */
-               WriteDOC((ofs >> 9)  & 0xff, docptr, Mplus_FlashAddress);
-               WriteDOC((ofs >> 17) & 0xff, docptr, Mplus_FlashAddress);
-               break;
-       case 3:
-               /* Send 0-7, 9-16, then 17-23 */
-               WriteDOC(ofs & 0xff, docptr, Mplus_FlashAddress);
-               WriteDOC((ofs >> 9)  & 0xff, docptr, Mplus_FlashAddress);
-               WriteDOC((ofs >> 17) & 0xff, docptr, Mplus_FlashAddress);
-               break;
-       default:
-               return;
-       }
-
-       WriteDOC(0x00, docptr, Mplus_WritePipeTerm);
-       WriteDOC(0x00, docptr, Mplus_WritePipeTerm);
-}
-
-/* DoC_SelectChip: Select a given flash chip within the current floor */
-static int DoC_SelectChip(void __iomem * docptr, int chip)
-{
-       /* No choice for flash chip on Millennium Plus */
-       return 0;
-}
-
-/* DoC_SelectFloor: Select a given floor (bank of flash chips) */
-static int DoC_SelectFloor(void __iomem * docptr, int floor)
-{
-       WriteDOC((floor & 0x3), docptr, Mplus_DeviceSelect);
-       return 0;
-}
-
-/*
- * Translate the given offset into the appropriate command and offset.
- * This does the mapping using the 16bit interleave layout defined by
- * M-Systems, and looks like this for a sector pair:
- *  +-----------+-------+-------+-------+--------------+---------+-----------+
- *  | 0 --- 511 |512-517|518-519|520-521| 522 --- 1033 |1034-1039|1040 - 1055|
- *  +-----------+-------+-------+-------+--------------+---------+-----------+
- *  | Data 0    | ECC 0 |Flags0 |Flags1 | Data 1       |ECC 1    | OOB 1 + 2 |
- *  +-----------+-------+-------+-------+--------------+---------+-----------+
- */
-/* FIXME: This lives in INFTL not here. Other users of flash devices
-   may not want it */
-static unsigned int DoC_GetDataOffset(struct mtd_info *mtd, loff_t *from)
-{
-       struct DiskOnChip *this = mtd->priv;
-
-       if (this->interleave) {
-               unsigned int ofs = *from & 0x3ff;
-               unsigned int cmd;
-
-               if (ofs < 512) {
-                       cmd = NAND_CMD_READ0;
-                       ofs &= 0x1ff;
-               } else if (ofs < 1014) {
-                       cmd = NAND_CMD_READ1;
-                       ofs = (ofs & 0x1ff) + 10;
-               } else {
-                       cmd = NAND_CMD_READOOB;
-                       ofs = ofs - 1014;
-               }
-
-               *from = (*from & ~0x3ff) | ofs;
-               return cmd;
-       } else {
-               /* No interleave */
-               if ((*from) & 0x100)
-                       return NAND_CMD_READ1;
-               return NAND_CMD_READ0;
-       }
-}
-
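DoC_GetDataOffset() above folds the interleaved layout shown in the table into a (command, column) pair for the 16-bit case. A stand-alone sketch of that mapping; the CMD_* values are symbolic stand-ins for the NAND_CMD_* constants, and the boundaries are copied from the function:

#include <stdio.h>

enum { CMD_READ0, CMD_READ1, CMD_READOOB };

/* Same arithmetic as the interleave branch of DoC_GetDataOffset():
 * reduce the offset within the 1KiB sector pair and pick the command
 * plus the adjusted column for it. */
static int map_offset(unsigned int *ofs_inout)
{
        unsigned int ofs = *ofs_inout & 0x3ff;
        unsigned int cmd;

        if (ofs < 512) {
                cmd = CMD_READ0;
                ofs &= 0x1ff;
        } else if (ofs < 1014) {
                cmd = CMD_READ1;
                ofs = (ofs & 0x1ff) + 10;
        } else {
                cmd = CMD_READOOB;
                ofs = ofs - 1014;
        }

        *ofs_inout = (*ofs_inout & ~0x3ffu) | ofs;
        return cmd;
}

int main(void)
{
        unsigned int samples[] = { 0, 600, 1040 };
        int i;

        for (i = 0; i < 3; i++) {
                unsigned int ofs = samples[i];
                int cmd = map_offset(&ofs);
                printf("in=%u -> cmd=%d, ofs=0x%x\n", samples[i], cmd, ofs);
        }
        return 0;
}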
-static unsigned int DoC_GetECCOffset(struct mtd_info *mtd, loff_t *from)
-{
-       unsigned int ofs, cmd;
-
-       if (*from & 0x200) {
-               cmd = NAND_CMD_READOOB;
-               ofs = 10 + (*from & 0xf);
-       } else {
-               cmd = NAND_CMD_READ1;
-               ofs = (*from & 0xf);
-       }
-
-       *from = (*from & ~0x3ff) | ofs;
-       return cmd;
-}
-
-static unsigned int DoC_GetFlagsOffset(struct mtd_info *mtd, loff_t *from)
-{
-       unsigned int ofs, cmd;
-
-       cmd = NAND_CMD_READ1;
-       ofs = (*from & 0x200) ? 8 : 6;
-       *from = (*from & ~0x3ff) | ofs;
-       return cmd;
-}
-
-static unsigned int DoC_GetHdrOffset(struct mtd_info *mtd, loff_t *from)
-{
-       unsigned int ofs, cmd;
-
-       cmd = NAND_CMD_READOOB;
-       ofs = (*from & 0x200) ? 24 : 16;
-       *from = (*from & ~0x3ff) | ofs;
-       return cmd;
-}
-
-static inline void MemReadDOC(void __iomem * docptr, unsigned char *buf, int len)
-{
-#ifndef USE_MEMCPY
-       int i;
-       for (i = 0; i < len; i++)
-               buf[i] = ReadDOC(docptr, Mil_CDSN_IO + i);
-#else
-       memcpy_fromio(buf, docptr + DoC_Mil_CDSN_IO, len);
-#endif
-}
-
-static inline void MemWriteDOC(void __iomem * docptr, unsigned char *buf, int len)
-{
-#ifndef USE_MEMCPY
-       int i;
-       for (i = 0; i < len; i++)
-               WriteDOC(buf[i], docptr, Mil_CDSN_IO + i);
-#else
-       memcpy_toio(docptr + DoC_Mil_CDSN_IO, buf, len);
-#endif
-}
-
-/* DoC_IdentChip: Identify a given NAND chip given {floor,chip} */
-static int DoC_IdentChip(struct DiskOnChip *doc, int floor, int chip)
-{
-       int mfr, id, i, j;
-       volatile char dummy;
-       void __iomem * docptr = doc->virtadr;
-
-       /* Page in the required floor/chip */
-       DoC_SelectFloor(docptr, floor);
-       DoC_SelectChip(docptr, chip);
-
-       /* Millennium Plus bus cycle sequence as per figure 2, section 2.4 */
-       WriteDOC((DOC_FLASH_CE | DOC_FLASH_WP), docptr, Mplus_FlashSelect);
-
-       /* Reset the chip, see Software Requirement 11.4 item 1. */
-       DoC_Command(docptr, NAND_CMD_RESET, 0);
-       DoC_WaitReady(docptr);
-
-       /* Read the NAND chip ID: 1. Send ReadID command */
-       DoC_Command(docptr, NAND_CMD_READID, 0);
-
-       /* Read the NAND chip ID: 2. Send address byte zero */
-       DoC_Address(doc, 1, 0x00, 0, 0x00);
-
-       WriteDOC(0, docptr, Mplus_FlashControl);
-       DoC_WaitReady(docptr);
-
-       /* Read the manufacturer and device id codes of the flash device through
-          CDSN IO register see Software Requirement 11.4 item 5.*/
-       dummy = ReadDOC(docptr, Mplus_ReadPipeInit);
-       dummy = ReadDOC(docptr, Mplus_ReadPipeInit);
-
-       mfr = ReadDOC(docptr, Mil_CDSN_IO);
-       if (doc->interleave)
-               dummy = ReadDOC(docptr, Mil_CDSN_IO); /* 2 way interleave */
-
-       id  = ReadDOC(docptr, Mil_CDSN_IO);
-       if (doc->interleave)
-               dummy = ReadDOC(docptr, Mil_CDSN_IO); /* 2 way interleave */
-
-       dummy = ReadDOC(docptr, Mplus_LastDataRead);
-       dummy = ReadDOC(docptr, Mplus_LastDataRead);
-
-       /* Disable flash internally */
-       WriteDOC(0, docptr, Mplus_FlashSelect);
-
-       /* No response - return failure */
-       if (mfr == 0xff || mfr == 0)
-               return 0;
-
-       for (i = 0; nand_flash_ids[i].name != NULL; i++) {
-               if (id == nand_flash_ids[i].id) {
-                       /* Try to identify manufacturer */
-                       for (j = 0; nand_manuf_ids[j].id != 0x0; j++) {
-                               if (nand_manuf_ids[j].id == mfr)
-                                       break;
-                       }
-                       printk(KERN_INFO "Flash chip found: Manufacturer ID: %2.2X, "
-                              "Chip ID: %2.2X (%s:%s)\n", mfr, id,
-                              nand_manuf_ids[j].name, nand_flash_ids[i].name);
-                       doc->mfr = mfr;
-                       doc->id = id;
-                       doc->chipshift = ffs((nand_flash_ids[i].chipsize << 20)) - 1;
-                       doc->erasesize = nand_flash_ids[i].erasesize << doc->interleave;
-                       break;
-               }
-       }
-
-       if (nand_flash_ids[i].name == NULL)
-               return 0;
-       return 1;
-}
-
-/* DoC_ScanChips: Find all NAND chips present in a DiskOnChip, and identify them */
-static void DoC_ScanChips(struct DiskOnChip *this)
-{
-       int floor, chip;
-       int numchips[MAX_FLOORS_MPLUS];
-       int ret;
-
-       this->numchips = 0;
-       this->mfr = 0;
-       this->id = 0;
-
-       /* Work out the intended interleave setting */
-       this->interleave = 0;
-       if (this->ChipID == DOC_ChipID_DocMilPlus32)
-               this->interleave = 1;
-
-       /* Check the ASIC agrees */
-       if ( (this->interleave << 2) !=
-            (ReadDOC(this->virtadr, Mplus_Configuration) & 4)) {
-               u_char conf = ReadDOC(this->virtadr, Mplus_Configuration);
-               printk(KERN_NOTICE "Setting DiskOnChip Millennium Plus interleave to %s\n",
-                      this->interleave?"on (16-bit)":"off (8-bit)");
-               conf ^= 4;
-               WriteDOC(conf, this->virtadr, Mplus_Configuration);
-       }
-
-       /* For each floor, find the number of valid chips it contains */
-       for (floor = 0,ret = 1; floor < MAX_FLOORS_MPLUS; floor++) {
-               numchips[floor] = 0;
-               for (chip = 0; chip < MAX_CHIPS_MPLUS && ret != 0; chip++) {
-                       ret = DoC_IdentChip(this, floor, chip);
-                       if (ret) {
-                               numchips[floor]++;
-                               this->numchips++;
-                       }
-               }
-       }
-       /* If there are none at all that we recognise, bail */
-       if (!this->numchips) {
-               printk("No flash chips recognised.\n");
-               return;
-       }
-
-       /* Allocate an array to hold the information for each chip */
-       this->chips = kmalloc(sizeof(struct Nand) * this->numchips, GFP_KERNEL);
-       if (!this->chips){
-               printk("MTD: No memory for allocating chip info structures\n");
-               return;
-       }
-
-       /* Fill out the chip array with {floor, chipno} for each
-        * detected chip in the device. */
-       for (floor = 0, ret = 0; floor < MAX_FLOORS_MPLUS; floor++) {
-               for (chip = 0 ; chip < numchips[floor] ; chip++) {
-                       this->chips[ret].floor = floor;
-                       this->chips[ret].chip = chip;
-                       this->chips[ret].curadr = 0;
-                       this->chips[ret].curmode = 0x50;
-                       ret++;
-               }
-       }
-
-       /* Calculate and print the total size of the device */
-       this->totlen = this->numchips * (1 << this->chipshift);
-       printk(KERN_INFO "%d flash chips found. Total DiskOnChip size: %ld MiB\n",
-              this->numchips, this->totlen >> 20);
-}
-
-static int DoCMilPlus_is_alias(struct DiskOnChip *doc1, struct DiskOnChip *doc2)
-{
-       int tmp1, tmp2, retval;
-
-       if (doc1->physadr == doc2->physadr)
-               return 1;
-
-       /* Use the alias resolution register which was set aside for this
-        * purpose. If its value is the same on both chips, they might
-        * be the same chip, and we write to one and check for a change in
-        * the other. It's unclear if this register is usable in the
-        * DoC 2000 (it's in the Millennium docs), but it seems to work. */
-       tmp1 = ReadDOC(doc1->virtadr, Mplus_AliasResolution);
-       tmp2 = ReadDOC(doc2->virtadr, Mplus_AliasResolution);
-       if (tmp1 != tmp2)
-               return 0;
-
-       WriteDOC((tmp1+1) % 0xff, doc1->virtadr, Mplus_AliasResolution);
-       tmp2 = ReadDOC(doc2->virtadr, Mplus_AliasResolution);
-       if (tmp2 == (tmp1+1) % 0xff)
-               retval = 1;
-       else
-               retval = 0;
-
-       /* Restore register contents.  May not be necessary, but do it just to
-        * be safe. */
-       WriteDOC(tmp1, doc1->virtadr, Mplus_AliasResolution);
-
-       return retval;
-}
-
-/* This routine is found from the docprobe code by symbol_get(),
- * which will bump the use count of this module. */
-void DoCMilPlus_init(struct mtd_info *mtd)
-{
-       struct DiskOnChip *this = mtd->priv;
-       struct DiskOnChip *old = NULL;
-
-       /* We must avoid being called twice for the same device. */
-       if (docmilpluslist)
-               old = docmilpluslist->priv;
-
-       while (old) {
-               if (DoCMilPlus_is_alias(this, old)) {
-                       printk(KERN_NOTICE "Ignoring DiskOnChip Millennium "
-                               "Plus at 0x%lX - already configured\n",
-                               this->physadr);
-                       iounmap(this->virtadr);
-                       kfree(mtd);
-                       return;
-               }
-               if (old->nextdoc)
-                       old = old->nextdoc->priv;
-               else
-                       old = NULL;
-       }
-
-       mtd->name = "DiskOnChip Millennium Plus";
-       printk(KERN_NOTICE "DiskOnChip Millennium Plus found at "
-               "address 0x%lX\n", this->physadr);
-
-       mtd->type = MTD_NANDFLASH;
-       mtd->flags = MTD_CAP_NANDFLASH;
-       mtd->writebufsize = mtd->writesize = 512;
-       mtd->oobsize = 16;
-       mtd->ecc_strength = 2;
-       mtd->owner = THIS_MODULE;
-       mtd->_erase = doc_erase;
-       mtd->_read = doc_read;
-       mtd->_write = doc_write;
-       mtd->_read_oob = doc_read_oob;
-       mtd->_write_oob = doc_write_oob;
-       this->curfloor = -1;
-       this->curchip = -1;
-
-       /* Ident all the chips present. */
-       DoC_ScanChips(this);
-
-       if (!this->totlen) {
-               kfree(mtd);
-               iounmap(this->virtadr);
-       } else {
-               this->nextdoc = docmilpluslist;
-               docmilpluslist = mtd;
-               mtd->size  = this->totlen;
-               mtd->erasesize = this->erasesize;
-               mtd_device_register(mtd, NULL, 0);
-               return;
-       }
-}
-EXPORT_SYMBOL_GPL(DoCMilPlus_init);
-
-#if 0
-static int doc_dumpblk(struct mtd_info *mtd, loff_t from)
-{
-       int i;
-       loff_t fofs;
-       struct DiskOnChip *this = mtd->priv;
-       void __iomem * docptr = this->virtadr;
-       struct Nand *mychip = &this->chips[from >> (this->chipshift)];
-       unsigned char *bp, buf[1056];
-       char c[32];
-
-       from &= ~0x3ff;
-
-       /* Don't allow read past end of device */
-       if (from >= this->totlen)
-               return -EINVAL;
-
-       DoC_CheckASIC(docptr);
-
-       /* Find the chip which is to be used and select it */
-       if (this->curfloor != mychip->floor) {
-               DoC_SelectFloor(docptr, mychip->floor);
-               DoC_SelectChip(docptr, mychip->chip);
-       } else if (this->curchip != mychip->chip) {
-               DoC_SelectChip(docptr, mychip->chip);
-       }
-       this->curfloor = mychip->floor;
-       this->curchip = mychip->chip;
-
-       /* Millennium Plus bus cycle sequence as per figure 2, section 2.4 */
-       WriteDOC((DOC_FLASH_CE | DOC_FLASH_WP), docptr, Mplus_FlashSelect);
-
-       /* Reset the chip, see Software Requirement 11.4 item 1. */
-       DoC_Command(docptr, NAND_CMD_RESET, 0);
-       DoC_WaitReady(docptr);
-
-       fofs = from;
-       DoC_Command(docptr, DoC_GetDataOffset(mtd, &fofs), 0);
-       DoC_Address(this, 3, fofs, 0, 0x00);
-       WriteDOC(0, docptr, Mplus_FlashControl);
-       DoC_WaitReady(docptr);
-
-       /* disable the ECC engine */
-       WriteDOC(DOC_ECC_RESET, docptr, Mplus_ECCConf);
-
-       ReadDOC(docptr, Mplus_ReadPipeInit);
-       ReadDOC(docptr, Mplus_ReadPipeInit);
-
-       /* Read the data via the internal pipeline through CDSN IO
-          register, see Pipelined Read Operations 11.3 */
-       MemReadDOC(docptr, buf, 1054);
-       buf[1054] = ReadDOC(docptr, Mplus_LastDataRead);
-       buf[1055] = ReadDOC(docptr, Mplus_LastDataRead);
-
-       memset(&c[0], 0, sizeof(c));
-       printk("DUMP OFFSET=%x:\n", (int)from);
-
-        for (i = 0, bp = &buf[0]; (i < 1056); i++) {
-                if ((i % 16) == 0)
-                        printk("%08x: ", i);
-                printk(" %02x", *bp);
-                c[(i & 0xf)] = ((*bp >= 0x20) && (*bp <= 0x7f)) ? *bp : '.';
-                bp++;
-                if (((i + 1) % 16) == 0)
-                        printk("    %s\n", c);
-        }
-       printk("\n");
-
-       /* Disable flash internally */
-       WriteDOC(0, docptr, Mplus_FlashSelect);
-
-       return 0;
-}
-#endif
-
-static int doc_read(struct mtd_info *mtd, loff_t from, size_t len,
-                   size_t *retlen, u_char *buf)
-{
-       int ret, i;
-       volatile char dummy;
-       loff_t fofs;
-       unsigned char syndrome[6], eccbuf[6];
-       struct DiskOnChip *this = mtd->priv;
-       void __iomem * docptr = this->virtadr;
-       struct Nand *mychip = &this->chips[from >> (this->chipshift)];
-
-       /* Don't allow a single read to cross a 512-byte block boundary */
-       if (from + len > ((from | 0x1ff) + 1))
-               len = ((from | 0x1ff) + 1) - from;
-
-       DoC_CheckASIC(docptr);
-
-       /* Find the chip which is to be used and select it */
-       if (this->curfloor != mychip->floor) {
-               DoC_SelectFloor(docptr, mychip->floor);
-               DoC_SelectChip(docptr, mychip->chip);
-       } else if (this->curchip != mychip->chip) {
-               DoC_SelectChip(docptr, mychip->chip);
-       }
-       this->curfloor = mychip->floor;
-       this->curchip = mychip->chip;
-
-       /* Millennium Plus bus cycle sequence as per figure 2, section 2.4 */
-       WriteDOC((DOC_FLASH_CE | DOC_FLASH_WP), docptr, Mplus_FlashSelect);
-
-       /* Reset the chip, see Software Requirement 11.4 item 1. */
-       DoC_Command(docptr, NAND_CMD_RESET, 0);
-       DoC_WaitReady(docptr);
-
-       fofs = from;
-       DoC_Command(docptr, DoC_GetDataOffset(mtd, &fofs), 0);
-       DoC_Address(this, 3, fofs, 0, 0x00);
-       WriteDOC(0, docptr, Mplus_FlashControl);
-       DoC_WaitReady(docptr);
-
-       /* init the ECC engine, see Reed-Solomon EDC/ECC 11.1 .*/
-       WriteDOC(DOC_ECC_RESET, docptr, Mplus_ECCConf);
-       WriteDOC(DOC_ECC_EN, docptr, Mplus_ECCConf);
-
-       /* Let the caller know we completed it */
-       *retlen = len;
-       ret = 0;
-
-       ReadDOC(docptr, Mplus_ReadPipeInit);
-       ReadDOC(docptr, Mplus_ReadPipeInit);
-
-       /* Read the data via the internal pipeline through CDSN IO
-          register, see Pipelined Read Operations 11.3 */
-       MemReadDOC(docptr, buf, len);
-
-       /* Read the ECC data following raw data */
-       MemReadDOC(docptr, eccbuf, 4);
-       eccbuf[4] = ReadDOC(docptr, Mplus_LastDataRead);
-       eccbuf[5] = ReadDOC(docptr, Mplus_LastDataRead);
-
-       /* Flush the pipeline */
-       dummy = ReadDOC(docptr, Mplus_ECCConf);
-       dummy = ReadDOC(docptr, Mplus_ECCConf);
-
-       /* Check the ECC Status */
-       if (ReadDOC(docptr, Mplus_ECCConf) & 0x80) {
-               int nb_errors;
-               /* There was an ECC error */
-#ifdef ECC_DEBUG
-               printk("DiskOnChip ECC Error: Read at %lx\n", (long)from);
-#endif
-               /* Read the ECC syndrome through the DiskOnChip ECC logic.
-                  These syndromes will be all ZERO when there is no error */
-               for (i = 0; i < 6; i++)
-                       syndrome[i] = ReadDOC(docptr, Mplus_ECCSyndrome0 + i);
-
-               nb_errors = doc_decode_ecc(buf, syndrome);
-#ifdef ECC_DEBUG
-               printk("ECC Errors corrected: %x\n", nb_errors);
-#endif
-               if (nb_errors < 0) {
-                       /* We return an error, but have actually done the
-                          read. Not that this can be reported to user-space
-                          via sys_read(), but at least MTD-aware code can
-                          detect it by checking *retlen */
-#ifdef ECC_DEBUG
-                       printk("%s(%d): Millennium Plus ECC error (from=0x%x):\n",
-                               __FILE__, __LINE__, (int)from);
-                       printk("        syndrome= %*phC\n", 6, syndrome);
-                       printk("        eccbuf= %*phC\n", 6, eccbuf);
-#endif
-                               ret = -EIO;
-               }
-       }
-
-#ifdef PSYCHO_DEBUG
-       printk("ECC DATA at %lx: %*ph\n", (long)from, 6, eccbuf);
-#endif
-       /* disable the ECC engine */
-       WriteDOC(DOC_ECC_DIS, docptr , Mplus_ECCConf);
-
-       /* Disable flash internally */
-       WriteDOC(0, docptr, Mplus_FlashSelect);
-
-       return ret;
-}
-
-static int doc_write(struct mtd_info *mtd, loff_t to, size_t len,
-                    size_t *retlen, const u_char *buf)
-{
-       int i, before, ret = 0;
-       loff_t fto;
-       volatile char dummy;
-       char eccbuf[6];
-       struct DiskOnChip *this = mtd->priv;
-       void __iomem * docptr = this->virtadr;
-       struct Nand *mychip = &this->chips[to >> (this->chipshift)];
-
-       /* Don't allow writes which aren't exactly one block (512 bytes) */
-       if ((to & 0x1ff) || (len != 0x200))
-               return -EINVAL;
-
-       /* Determine position of OOB flags, before or after data */
-       before = (this->interleave && (to & 0x200));
-
-       DoC_CheckASIC(docptr);
-
-       /* Find the chip which is to be used and select it */
-       if (this->curfloor != mychip->floor) {
-               DoC_SelectFloor(docptr, mychip->floor);
-               DoC_SelectChip(docptr, mychip->chip);
-       } else if (this->curchip != mychip->chip) {
-               DoC_SelectChip(docptr, mychip->chip);
-       }
-       this->curfloor = mychip->floor;
-       this->curchip = mychip->chip;
-
-       /* Millennium Plus bus cycle sequence as per figure 2, section 2.4 */
-       WriteDOC(DOC_FLASH_CE, docptr, Mplus_FlashSelect);
-
-       /* Reset the chip, see Software Requirement 11.4 item 1. */
-       DoC_Command(docptr, NAND_CMD_RESET, 0);
-       DoC_WaitReady(docptr);
-
-       /* Set device to appropriate plane of flash */
-       fto = to;
-       WriteDOC(DoC_GetDataOffset(mtd, &fto), docptr, Mplus_FlashCmd);
-
-       /* On interleaved devices the flags for 2nd half 512 are before data */
-       if (before)
-               fto -= 2;
-
-       /* Issue the Serial Data In command to initiate the Page Program process */
-       DoC_Command(docptr, NAND_CMD_SEQIN, 0x00);
-       DoC_Address(this, 3, fto, 0x00, 0x00);
-
-       /* Disable the ECC engine */
-       WriteDOC(DOC_ECC_RESET, docptr, Mplus_ECCConf);
-
-       if (before) {
-               /* Write the block status BLOCK_USED (0x5555) */
-               WriteDOC(0x55, docptr, Mil_CDSN_IO);
-               WriteDOC(0x55, docptr, Mil_CDSN_IO);
-       }
-
-       /* init the ECC engine, see Reed-Solomon EDC/ECC 11.1 .*/
-       WriteDOC(DOC_ECC_EN | DOC_ECC_RW, docptr, Mplus_ECCConf);
-
-       MemWriteDOC(docptr, (unsigned char *) buf, len);
-
-       /* Write ECC data to flash, the ECC info is generated by
-          the DiskOnChip ECC logic see Reed-Solomon EDC/ECC 11.1 */
-       DoC_Delay(docptr, 3);
-
-       /* Read the ECC data through the DiskOnChip ECC logic */
-       for (i = 0; i < 6; i++)
-               eccbuf[i] = ReadDOC(docptr, Mplus_ECCSyndrome0 + i);
-
-       /* disable the ECC engine */
-       WriteDOC(DOC_ECC_DIS, docptr, Mplus_ECCConf);
-
-       /* Write the ECC data to flash */
-       MemWriteDOC(docptr, eccbuf, 6);
-
-       if (!before) {
-               /* Write the block status BLOCK_USED (0x5555) */
-               WriteDOC(0x55, docptr, Mil_CDSN_IO+6);
-               WriteDOC(0x55, docptr, Mil_CDSN_IO+7);
-       }
-
-#ifdef PSYCHO_DEBUG
-       printk("OOB data at %lx is %2.2X %2.2X %2.2X %2.2X %2.2X %2.2X\n",
-              (long) to, eccbuf[0], eccbuf[1], eccbuf[2], eccbuf[3],
-              eccbuf[4], eccbuf[5]);
-#endif
-
-       WriteDOC(0x00, docptr, Mplus_WritePipeTerm);
-       WriteDOC(0x00, docptr, Mplus_WritePipeTerm);
-
-       /* Commit the Page Program command and wait for ready
-          see Software Requirement 11.4 item 1.*/
-       DoC_Command(docptr, NAND_CMD_PAGEPROG, 0x00);
-       DoC_WaitReady(docptr);
-
-       /* Read the status of the flash device through CDSN IO register
-          see Software Requirement 11.4 item 5.*/
-       DoC_Command(docptr, NAND_CMD_STATUS, 0);
-       dummy = ReadDOC(docptr, Mplus_ReadPipeInit);
-       dummy = ReadDOC(docptr, Mplus_ReadPipeInit);
-       DoC_Delay(docptr, 2);
-       if ((dummy = ReadDOC(docptr, Mplus_LastDataRead)) & 1) {
-               printk("MTD: Error 0x%x programming at 0x%x\n", dummy, (int)to);
-               /* Error in programming
-                  FIXME: implement Bad Block Replacement (in nftl.c ??) */
-               ret = -EIO;
-       }
-       dummy = ReadDOC(docptr, Mplus_LastDataRead);
-
-       /* Disable flash internally */
-       WriteDOC(0, docptr, Mplus_FlashSelect);
-
-       /* Let the caller know we completed it */
-       *retlen = len;
-
-       return ret;
-}
-
-static int doc_read_oob(struct mtd_info *mtd, loff_t ofs,
-                       struct mtd_oob_ops *ops)
-{
-       loff_t fofs, base;
-       struct DiskOnChip *this = mtd->priv;
-       void __iomem * docptr = this->virtadr;
-       struct Nand *mychip = &this->chips[ofs >> this->chipshift];
-       size_t i, size, got, want;
-       uint8_t *buf = ops->oobbuf;
-       size_t len = ops->len;
-
-       BUG_ON(ops->mode != MTD_OPS_PLACE_OOB);
-
-       ofs += ops->ooboffs;
-
-       DoC_CheckASIC(docptr);
-
-       /* Find the chip which is to be used and select it */
-       if (this->curfloor != mychip->floor) {
-               DoC_SelectFloor(docptr, mychip->floor);
-               DoC_SelectChip(docptr, mychip->chip);
-       } else if (this->curchip != mychip->chip) {
-               DoC_SelectChip(docptr, mychip->chip);
-       }
-       this->curfloor = mychip->floor;
-       this->curchip = mychip->chip;
-
-       /* Millennium Plus bus cycle sequence as per figure 2, section 2.4 */
-       WriteDOC((DOC_FLASH_CE | DOC_FLASH_WP), docptr, Mplus_FlashSelect);
-
-       /* disable the ECC engine */
-       WriteDOC(DOC_ECC_RESET, docptr, Mplus_ECCConf);
-       DoC_WaitReady(docptr);
-
-       /* Maximum of 16 bytes in the OOB region, so limit read to that */
-       if (len > 16)
-               len = 16;
-       got = 0;
-       want = len;
-
-       for (i = 0; ((i < 3) && (want > 0)); i++) {
-               /* Figure out which region we are accessing... */
-               fofs = ofs;
-               base = ofs & 0xf;
-               if (!this->interleave) {
-                       DoC_Command(docptr, NAND_CMD_READOOB, 0);
-                       size = 16 - base;
-               } else if (base < 6) {
-                       DoC_Command(docptr, DoC_GetECCOffset(mtd, &fofs), 0);
-                       size = 6 - base;
-               } else if (base < 8) {
-                       DoC_Command(docptr, DoC_GetFlagsOffset(mtd, &fofs), 0);
-                       size = 8 - base;
-               } else {
-                       DoC_Command(docptr, DoC_GetHdrOffset(mtd, &fofs), 0);
-                       size = 16 - base;
-               }
-               if (size > want)
-                       size = want;
-
-               /* Issue read command */
-               DoC_Address(this, 3, fofs, 0, 0x00);
-               WriteDOC(0, docptr, Mplus_FlashControl);
-               DoC_WaitReady(docptr);
-
-               ReadDOC(docptr, Mplus_ReadPipeInit);
-               ReadDOC(docptr, Mplus_ReadPipeInit);
-               MemReadDOC(docptr, &buf[got], size - 2);
-               buf[got + size - 2] = ReadDOC(docptr, Mplus_LastDataRead);
-               buf[got + size - 1] = ReadDOC(docptr, Mplus_LastDataRead);
-
-               ofs += size;
-               got += size;
-               want -= size;
-       }
-
-       /* Disable flash internally */
-       WriteDOC(0, docptr, Mplus_FlashSelect);
-
-       ops->retlen = len;
-       return 0;
-}
-
-static int doc_write_oob(struct mtd_info *mtd, loff_t ofs,
-                        struct mtd_oob_ops *ops)
-{
-       volatile char dummy;
-       loff_t fofs, base;
-       struct DiskOnChip *this = mtd->priv;
-       void __iomem * docptr = this->virtadr;
-       struct Nand *mychip = &this->chips[ofs >> this->chipshift];
-       size_t i, size, got, want;
-       int ret = 0;
-       uint8_t *buf = ops->oobbuf;
-       size_t len = ops->len;
-
-       BUG_ON(ops->mode != MTD_OPS_PLACE_OOB);
-
-       ofs += ops->ooboffs;
-
-       DoC_CheckASIC(docptr);
-
-       /* Find the chip which is to be used and select it */
-       if (this->curfloor != mychip->floor) {
-               DoC_SelectFloor(docptr, mychip->floor);
-               DoC_SelectChip(docptr, mychip->chip);
-       } else if (this->curchip != mychip->chip) {
-               DoC_SelectChip(docptr, mychip->chip);
-       }
-       this->curfloor = mychip->floor;
-       this->curchip = mychip->chip;
-
-       /* Millennium Plus bus cycle sequence as per figure 2, section 2.4 */
-       WriteDOC(DOC_FLASH_CE, docptr, Mplus_FlashSelect);
-
-
-       /* Maximum of 16 bytes in the OOB region, so limit write to that */
-       if (len > 16)
-               len = 16;
-       got = 0;
-       want = len;
-
-       for (i = 0; ((i < 3) && (want > 0)); i++) {
-               /* Reset the chip, see Software Requirement 11.4 item 1. */
-               DoC_Command(docptr, NAND_CMD_RESET, 0);
-               DoC_WaitReady(docptr);
-
-               /* Figure out which region we are accessing... */
-               fofs = ofs;
-               base = ofs & 0x0f;
-               if (!this->interleave) {
-                       WriteDOC(NAND_CMD_READOOB, docptr, Mplus_FlashCmd);
-                       size = 16 - base;
-               } else if (base < 6) {
-                       WriteDOC(DoC_GetECCOffset(mtd, &fofs), docptr, Mplus_FlashCmd);
-                       size = 6 - base;
-               } else if (base < 8) {
-                       WriteDOC(DoC_GetFlagsOffset(mtd, &fofs), docptr, Mplus_FlashCmd);
-                       size = 8 - base;
-               } else {
-                       WriteDOC(DoC_GetHdrOffset(mtd, &fofs), docptr, Mplus_FlashCmd);
-                       size = 16 - base;
-               }
-               if (size > want)
-                       size = want;
-
-               /* Issue the Serial Data In command to initiate the Page Program process */
-               DoC_Command(docptr, NAND_CMD_SEQIN, 0x00);
-               DoC_Address(this, 3, fofs, 0, 0x00);
-
-               /* Disable the ECC engine */
-               WriteDOC(DOC_ECC_RESET, docptr, Mplus_ECCConf);
-
-               /* Write the data via the internal pipeline through CDSN IO
-                  register, see Pipelined Write Operations 11.2 */
-               MemWriteDOC(docptr, (unsigned char *) &buf[got], size);
-               WriteDOC(0x00, docptr, Mplus_WritePipeTerm);
-               WriteDOC(0x00, docptr, Mplus_WritePipeTerm);
-
-               /* Commit the Page Program command and wait for ready
-                  see Software Requirement 11.4 item 1.*/
-               DoC_Command(docptr, NAND_CMD_PAGEPROG, 0x00);
-               DoC_WaitReady(docptr);
-
-               /* Read the status of the flash device through CDSN IO register
-                  see Software Requirement 11.4 item 5.*/
-               DoC_Command(docptr, NAND_CMD_STATUS, 0x00);
-               dummy = ReadDOC(docptr, Mplus_ReadPipeInit);
-               dummy = ReadDOC(docptr, Mplus_ReadPipeInit);
-               DoC_Delay(docptr, 2);
-               if ((dummy = ReadDOC(docptr, Mplus_LastDataRead)) & 1) {
-                       printk("MTD: Error 0x%x programming oob at 0x%x\n",
-                               dummy, (int)ofs);
-                       /* FIXME: implement Bad Block Replacement */
-                       ops->retlen = 0;
-                       ret = -EIO;
-               }
-               dummy = ReadDOC(docptr, Mplus_LastDataRead);
-
-               ofs += size;
-               got += size;
-               want -= size;
-       }
-
-       /* Disable flash internally */
-       WriteDOC(0, docptr, Mplus_FlashSelect);
-
-       ops->retlen = len;
-       return ret;
-}
-
-int doc_erase(struct mtd_info *mtd, struct erase_info *instr)
-{
-       volatile char dummy;
-       struct DiskOnChip *this = mtd->priv;
-       __u32 ofs = instr->addr;
-       __u32 len = instr->len;
-       void __iomem * docptr = this->virtadr;
-       struct Nand *mychip = &this->chips[ofs >> this->chipshift];
-
-       DoC_CheckASIC(docptr);
-
-       if (len != mtd->erasesize)
-               printk(KERN_WARNING "MTD: Erase not right size (%x != %x)\n",
-                      len, mtd->erasesize);
-
-       /* Find the chip which is to be used and select it */
-       if (this->curfloor != mychip->floor) {
-               DoC_SelectFloor(docptr, mychip->floor);
-               DoC_SelectChip(docptr, mychip->chip);
-       } else if (this->curchip != mychip->chip) {
-               DoC_SelectChip(docptr, mychip->chip);
-       }
-       this->curfloor = mychip->floor;
-       this->curchip = mychip->chip;
-
-       instr->state = MTD_ERASE_PENDING;
-
-       /* Millennium Plus bus cycle sequence as per figure 2, section 2.4 */
-       WriteDOC(DOC_FLASH_CE, docptr, Mplus_FlashSelect);
-
-       DoC_Command(docptr, NAND_CMD_RESET, 0x00);
-       DoC_WaitReady(docptr);
-
-       DoC_Command(docptr, NAND_CMD_ERASE1, 0);
-       DoC_Address(this, 2, ofs, 0, 0x00);
-       DoC_Command(docptr, NAND_CMD_ERASE2, 0);
-       DoC_WaitReady(docptr);
-       instr->state = MTD_ERASING;
-
-       /* Read the status of the flash device through CDSN IO register
-          see Software Requirement 11.4 item 5. */
-       DoC_Command(docptr, NAND_CMD_STATUS, 0);
-       dummy = ReadDOC(docptr, Mplus_ReadPipeInit);
-       dummy = ReadDOC(docptr, Mplus_ReadPipeInit);
-       if ((dummy = ReadDOC(docptr, Mplus_LastDataRead)) & 1) {
-               printk("MTD: Error 0x%x erasing at 0x%x\n", dummy, ofs);
-               /* FIXME: implement Bad Block Replacement (in nftl.c ??) */
-               instr->state = MTD_ERASE_FAILED;
-       } else {
-               instr->state = MTD_ERASE_DONE;
-       }
-       dummy = ReadDOC(docptr, Mplus_LastDataRead);
-
-       /* Disable flash internally */
-       WriteDOC(0, docptr, Mplus_FlashSelect);
-
-       mtd_erase_callback(instr);
-
-       return 0;
-}
-
-/****************************************************************************
- *
- * Module stuff
- *
- ****************************************************************************/
-
-static void __exit cleanup_doc2001plus(void)
-{
-       struct mtd_info *mtd;
-       struct DiskOnChip *this;
-
-       while ((mtd=docmilpluslist)) {
-               this = mtd->priv;
-               docmilpluslist = this->nextdoc;
-
-               mtd_device_unregister(mtd);
-
-               iounmap(this->virtadr);
-               kfree(this->chips);
-               kfree(mtd);
-       }
-}
-
-module_exit(cleanup_doc2001plus);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Greg Ungerer <gerg@snapgear.com> et al.");
-MODULE_DESCRIPTION("Driver for DiskOnChip Millennium Plus");
diff --git a/drivers/mtd/devices/docecc.c b/drivers/mtd/devices/docecc.c
deleted file mode 100644 (file)
index 4a1c39b..0000000
+++ /dev/null
@@ -1,521 +0,0 @@
-/*
- * ECC algorithm for M-systems disk on chip. We use the excellent Reed-
- * Solomon code of Phil Karn (karn@ka9q.ampr.org) available under the
- * GNU GPL License. The rest is simply to convert the disk on chip
- * syndrome into a standard syndrome.
- *
- * Author: Fabrice Bellard (fabrice.bellard@netgem.com)
- * Copyright (C) 2000 Netgem S.A.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
- */
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <asm/errno.h>
-#include <asm/io.h>
-#include <asm/uaccess.h>
-#include <linux/delay.h>
-#include <linux/slab.h>
-#include <linux/init.h>
-#include <linux/types.h>
-
-#include <linux/mtd/mtd.h>
-#include <linux/mtd/doc2000.h>
-
-#define DEBUG_ECC 0
-/* need to undef it (from asm/termbits.h) */
-#undef B0
-
-#define MM 10 /* Symbol size in bits */
-#define KK (1023-4) /* Number of data symbols per block */
-#define B0 510 /* First root of generator polynomial, alpha form */
-#define PRIM 1 /* power of alpha used to generate roots of generator poly */
-#define        NN ((1 << MM) - 1)
-
-typedef unsigned short dtype;
-
-/* 1+x^3+x^10 */
-static const int Pp[MM+1] = { 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1 };
-
-/* This defines the type used to store an element of the Galois Field
- * used by the code. Make sure this is something larger than a char
- * if anything larger than GF(256) is used.
- *
- * Note: unsigned char will work up to GF(256) but int seems to run
- * faster on the Pentium.
- */
-typedef int gf;
-
-/* No legal value in index form represents zero, so
- * we need a special value for this purpose
- */
-#define A0     (NN)
-
-/* Compute x % NN, where NN is 2**MM - 1,
- * without a slow divide
- */
-static inline gf
-modnn(int x)
-{
-  while (x >= NN) {
-    x -= NN;
-    x = (x >> MM) + (x & NN);
-  }
-  return x;
-}
-
-#define        CLEAR(a,n) {\
-int ci;\
-for(ci=(n)-1;ci >=0;ci--)\
-(a)[ci] = 0;\
-}
-
-#define        COPY(a,b,n) {\
-int ci;\
-for(ci=(n)-1;ci >=0;ci--)\
-(a)[ci] = (b)[ci];\
-}
-
-#define        COPYDOWN(a,b,n) {\
-int ci;\
-for(ci=(n)-1;ci >=0;ci--)\
-(a)[ci] = (b)[ci];\
-}
-
-#define Ldec 1
-
-/* generate GF(2**m) from the irreducible polynomial p(X) in Pp[0]..Pp[m]
-   lookup tables:  index->polynomial form   alpha_to[] contains j=alpha**i;
-                   polynomial form -> index form  index_of[j=alpha**i] = i
-   alpha=2 is the primitive element of GF(2**m)
-   HARI's COMMENT: (4/13/94) alpha_to[] can be used as follows:
-        Let @ represent the primitive element commonly called "alpha" that
-   is the root of the primitive polynomial p(x). Then in GF(2^m), for any
-   0 <= i <= 2^m-2,
-        @^i = a(0) + a(1) @ + a(2) @^2 + ... + a(m-1) @^(m-1)
-   where the binary vector (a(0),a(1),a(2),...,a(m-1)) is the representation
-   of the integer "alpha_to[i]" with a(0) being the LSB and a(m-1) the MSB. Thus for
-   example the polynomial representation of @^5 would be given by the binary
-   representation of the integer "alpha_to[5]".
-                   Similarly, index_of[] can be used as follows:
-        As above, let @ represent the primitive element of GF(2^m) that is
-   the root of the primitive polynomial p(x). In order to find the power
-   of @ (alpha) that has the polynomial representation
-        a(0) + a(1) @ + a(2) @^2 + ... + a(m-1) @^(m-1)
-   we consider the integer "i" whose binary representation with a(0) being LSB
-   and a(m-1) MSB is (a(0),a(1),...,a(m-1)) and locate the entry
-   "index_of[i]". Now, @^index_of[i] is that element whose polynomial
-    representation is (a(0),a(1),a(2),...,a(m-1)).
-   NOTE:
-        The element alpha_to[2^m-1] = 0 always signifying that the
-   representation of "@^infinity" = 0 is (0,0,0,...,0).
-        Similarly, the element index_of[0] = A0 always signifying
-   that the power of alpha which has the polynomial representation
-   (0,0,...,0) is "infinity".
-
-*/
-
-static void
-generate_gf(dtype Alpha_to[NN + 1], dtype Index_of[NN + 1])
-{
-  register int i, mask;
-
-  mask = 1;
-  Alpha_to[MM] = 0;
-  for (i = 0; i < MM; i++) {
-    Alpha_to[i] = mask;
-    Index_of[Alpha_to[i]] = i;
-    /* If Pp[i] == 1 then, term @^i occurs in poly-repr of @^MM */
-    if (Pp[i] != 0)
-      Alpha_to[MM] ^= mask;    /* Bit-wise EXOR operation */
-    mask <<= 1;        /* single left-shift */
-  }
-  Index_of[Alpha_to[MM]] = MM;
-  /*
-   * Have obtained poly-repr of @^MM. Poly-repr of @^(i+1) is given by
-   * poly-repr of @^i shifted left one-bit and accounting for any @^MM
-   * term that may occur when poly-repr of @^i is shifted.
-   */
-  mask >>= 1;
-  for (i = MM + 1; i < NN; i++) {
-    if (Alpha_to[i - 1] >= mask)
-      Alpha_to[i] = Alpha_to[MM] ^ ((Alpha_to[i - 1] ^ mask) << 1);
-    else
-      Alpha_to[i] = Alpha_to[i - 1] << 1;
-    Index_of[Alpha_to[i]] = i;
-  }
-  Index_of[0] = A0;
-  Alpha_to[NN] = 0;
-}
-
-/*
- * Performs ERRORS+ERASURES decoding of RS codes. bb[] is the content
- * of the feedback shift register after having processed the data and
- * the ECC.
- *
- * Return number of symbols corrected, or -1 if codeword is illegal
- * or uncorrectable. If eras_pos is non-null, the detected error locations
- * are written back. NOTE! This array must be at least NN-KK elements long.
- * The corrected data are written in eras_val[]. They must be XORed with the
- * data to retrieve the correct data: data[eras_pos[i]] ^= eras_val[i].
- *
- * First "no_eras" erasures are declared by the calling program. Then, the
- * maximum # of errors correctable is t_after_eras = floor((NN-KK-no_eras)/2).
- * If the number of channel errors is not greater than "t_after_eras" the
- * transmitted codeword will be recovered. Details of algorithm can be found
- * in R. Blahut's "Theory ... of Error-Correcting Codes".
-
- * Warning: the eras_pos[] array must not contain duplicate entries; decoder failure
- * will result. The decoder *could* check for this condition, but it would involve
- * extra time on every decoding operation.
- * */
-static int
-eras_dec_rs(dtype Alpha_to[NN + 1], dtype Index_of[NN + 1],
-            gf bb[NN - KK + 1], gf eras_val[NN-KK], int eras_pos[NN-KK],
-            int no_eras)
-{
-  int deg_lambda, el, deg_omega;
-  int i, j, r,k;
-  gf u,q,tmp,num1,num2,den,discr_r;
-  gf lambda[NN-KK + 1], s[NN-KK + 1];  /* Err+Eras Locator poly
-                                        * and syndrome poly */
-  gf b[NN-KK + 1], t[NN-KK + 1], omega[NN-KK + 1];
-  gf root[NN-KK], reg[NN-KK + 1], loc[NN-KK];
-  int syn_error, count;
-
-  syn_error = 0;
-  for(i=0;i<NN-KK;i++)
-      syn_error |= bb[i];
-
-  if (!syn_error) {
-    /* if remainder is zero, data[] is a codeword and there are no
-     * errors to correct. So return data[] unmodified
-     */
-    count = 0;
-    goto finish;
-  }
-
-  for(i=1;i<=NN-KK;i++){
-    s[i] = bb[0];
-  }
-  for(j=1;j<NN-KK;j++){
-    if(bb[j] == 0)
-      continue;
-    tmp = Index_of[bb[j]];
-
-    for(i=1;i<=NN-KK;i++)
-      s[i] ^= Alpha_to[modnn(tmp + (B0+i-1)*PRIM*j)];
-  }
-
-  /* undo the feedback register implicit multiplication and convert
-     syndromes to index form */
-
-  for(i=1;i<=NN-KK;i++) {
-      tmp = Index_of[s[i]];
-      if (tmp != A0)
-          tmp = modnn(tmp + 2 * KK * (B0+i-1)*PRIM);
-      s[i] = tmp;
-  }
-
-  CLEAR(&lambda[1],NN-KK);
-  lambda[0] = 1;
-
-  if (no_eras > 0) {
-    /* Init lambda to be the erasure locator polynomial */
-    lambda[1] = Alpha_to[modnn(PRIM * eras_pos[0])];
-    for (i = 1; i < no_eras; i++) {
-      u = modnn(PRIM*eras_pos[i]);
-      for (j = i+1; j > 0; j--) {
-       tmp = Index_of[lambda[j - 1]];
-       if(tmp != A0)
-         lambda[j] ^= Alpha_to[modnn(u + tmp)];
-      }
-    }
-#if DEBUG_ECC >= 1
-    /* Test code that verifies the erasure locator polynomial just constructed.
-       Needed only for decoder debugging. */
-
-    /* find roots of the erasure location polynomial */
-    for(i=1;i<=no_eras;i++)
-      reg[i] = Index_of[lambda[i]];
-    count = 0;
-    for (i = 1,k=NN-Ldec; i <= NN; i++,k = modnn(NN+k-Ldec)) {
-      q = 1;
-      for (j = 1; j <= no_eras; j++)
-       if (reg[j] != A0) {
-         reg[j] = modnn(reg[j] + j);
-         q ^= Alpha_to[reg[j]];
-       }
-      if (q != 0)
-       continue;
-      /* store root and error location number indices */
-      root[count] = i;
-      loc[count] = k;
-      count++;
-    }
-    if (count != no_eras) {
-      printf("\n lambda(x) is WRONG\n");
-      count = -1;
-      goto finish;
-    }
-#if DEBUG_ECC >= 2
-    printf("\n Erasure positions as determined by roots of Eras Loc Poly:\n");
-    for (i = 0; i < count; i++)
-      printf("%d ", loc[i]);
-    printf("\n");
-#endif
-#endif
-  }
-  for(i=0;i<NN-KK+1;i++)
-    b[i] = Index_of[lambda[i]];
-
-  /*
-   * Begin Berlekamp-Massey algorithm to determine error+erasure
-   * locator polynomial
-   */
-  r = no_eras;
-  el = no_eras;
-  while (++r <= NN-KK) {       /* r is the step number */
-    /* Compute discrepancy at the r-th step in poly-form */
-    discr_r = 0;
-    for (i = 0; i < r; i++){
-      if ((lambda[i] != 0) && (s[r - i] != A0)) {
-       discr_r ^= Alpha_to[modnn(Index_of[lambda[i]] + s[r - i])];
-      }
-    }
-    discr_r = Index_of[discr_r];       /* Index form */
-    if (discr_r == A0) {
-      /* 2 lines below: B(x) <-- x*B(x) */
-      COPYDOWN(&b[1],b,NN-KK);
-      b[0] = A0;
-    } else {
-      /* 7 lines below: T(x) <-- lambda(x) - discr_r*x*b(x) */
-      t[0] = lambda[0];
-      for (i = 0 ; i < NN-KK; i++) {
-       if(b[i] != A0)
-         t[i+1] = lambda[i+1] ^ Alpha_to[modnn(discr_r + b[i])];
-       else
-         t[i+1] = lambda[i+1];
-      }
-      if (2 * el <= r + no_eras - 1) {
-       el = r + no_eras - el;
-       /*
-        * 2 lines below: B(x) <-- inv(discr_r) *
-        * lambda(x)
-        */
-       for (i = 0; i <= NN-KK; i++)
-         b[i] = (lambda[i] == 0) ? A0 : modnn(Index_of[lambda[i]] - discr_r + NN);
-      } else {
-       /* 2 lines below: B(x) <-- x*B(x) */
-       COPYDOWN(&b[1],b,NN-KK);
-       b[0] = A0;
-      }
-      COPY(lambda,t,NN-KK+1);
-    }
-  }
-
-  /* Convert lambda to index form and compute deg(lambda(x)) */
-  deg_lambda = 0;
-  for(i=0;i<NN-KK+1;i++){
-    lambda[i] = Index_of[lambda[i]];
-    if(lambda[i] != A0)
-      deg_lambda = i;
-  }
-  /*
-   * Find roots of the error+erasure locator polynomial by Chien
-   * Search
-   */
-  COPY(&reg[1],&lambda[1],NN-KK);
-  count = 0;           /* Number of roots of lambda(x) */
-  for (i = 1,k=NN-Ldec; i <= NN; i++,k = modnn(NN+k-Ldec)) {
-    q = 1;
-    for (j = deg_lambda; j > 0; j--){
-      if (reg[j] != A0) {
-       reg[j] = modnn(reg[j] + j);
-       q ^= Alpha_to[reg[j]];
-      }
-    }
-    if (q != 0)
-      continue;
-    /* store root (index-form) and error location number */
-    root[count] = i;
-    loc[count] = k;
-    /* If we've already found max possible roots,
-     * abort the search to save time
-     */
-    if(++count == deg_lambda)
-      break;
-  }
-  if (deg_lambda != count) {
-    /*
-     * deg(lambda) unequal to number of roots => uncorrectable
-     * error detected
-     */
-    count = -1;
-    goto finish;
-  }
-  /*
-   * Compute err+eras evaluator poly omega(x) = s(x)*lambda(x) (modulo
-   * x**(NN-KK)). in index form. Also find deg(omega).
-   */
-  deg_omega = 0;
-  for (i = 0; i < NN-KK;i++){
-    tmp = 0;
-    j = (deg_lambda < i) ? deg_lambda : i;
-    for(;j >= 0; j--){
-      if ((s[i + 1 - j] != A0) && (lambda[j] != A0))
-       tmp ^= Alpha_to[modnn(s[i + 1 - j] + lambda[j])];
-    }
-    if(tmp != 0)
-      deg_omega = i;
-    omega[i] = Index_of[tmp];
-  }
-  omega[NN-KK] = A0;
-
-  /*
-   * Compute error values in poly-form. num1 = omega(inv(X(l))), num2 =
-   * inv(X(l))**(B0-1) and den = lambda_pr(inv(X(l))) all in poly-form
-   */
-  for (j = count-1; j >=0; j--) {
-    num1 = 0;
-    for (i = deg_omega; i >= 0; i--) {
-      if (omega[i] != A0)
-       num1  ^= Alpha_to[modnn(omega[i] + i * root[j])];
-    }
-    num2 = Alpha_to[modnn(root[j] * (B0 - 1) + NN)];
-    den = 0;
-
-    /* lambda[i+1] for i even is the formal derivative lambda_pr of lambda[i] */
-    for (i = min(deg_lambda,NN-KK-1) & ~1; i >= 0; i -=2) {
-      if(lambda[i+1] != A0)
-       den ^= Alpha_to[modnn(lambda[i+1] + i * root[j])];
-    }
-    if (den == 0) {
-#if DEBUG_ECC >= 1
-      printf("\n ERROR: denominator = 0\n");
-#endif
-      /* Convert to dual-basis */
-      count = -1;
-      goto finish;
-    }
-    /* Apply error to data */
-    if (num1 != 0) {
-        eras_val[j] = Alpha_to[modnn(Index_of[num1] + Index_of[num2] + NN - Index_of[den])];
-    } else {
-        eras_val[j] = 0;
-    }
-  }
- finish:
-  for(i=0;i<count;i++)
-      eras_pos[i] = loc[i];
-  return count;
-}
-
-/***************************************************************************/
-/* The DOC specific code begins here */
-
-#define SECTOR_SIZE 512
-/* The sector bytes are packed into NB_DATA MM bits words */
-#define NB_DATA (((SECTOR_SIZE + 1) * 8 + 6) / MM)
-
-/*
- * Correct the errors in 'sector[]' by using 'ecc1[]' which is the
- * content of the feedback shift register applied to the sector and
- * the ECC. Return the number of errors corrected (and correct them in
- * sector), or -1 if error
- */
-int doc_decode_ecc(unsigned char sector[SECTOR_SIZE], unsigned char ecc1[6])
-{
-    int parity, i, nb_errors;
-    gf bb[NN - KK + 1];
-    gf error_val[NN-KK];
-    int error_pos[NN-KK], pos, bitpos, index, val;
-    dtype *Alpha_to, *Index_of;
-
-    /* init log and exp tables here to save memory. However, it is slower */
-    Alpha_to = kmalloc((NN + 1) * sizeof(dtype), GFP_KERNEL);
-    if (!Alpha_to)
-        return -1;
-
-    Index_of = kmalloc((NN + 1) * sizeof(dtype), GFP_KERNEL);
-    if (!Index_of) {
-        kfree(Alpha_to);
-        return -1;
-    }
-
-    generate_gf(Alpha_to, Index_of);
-
-    parity = ecc1[1];
-
-    bb[0] =  (ecc1[4] & 0xff) | ((ecc1[5] & 0x03) << 8);
-    bb[1] = ((ecc1[5] & 0xfc) >> 2) | ((ecc1[2] & 0x0f) << 6);
-    bb[2] = ((ecc1[2] & 0xf0) >> 4) | ((ecc1[3] & 0x3f) << 4);
-    bb[3] = ((ecc1[3] & 0xc0) >> 6) | ((ecc1[0] & 0xff) << 2);
-
-    nb_errors = eras_dec_rs(Alpha_to, Index_of, bb,
-                            error_val, error_pos, 0);
-    if (nb_errors <= 0)
-        goto the_end;
-
-    /* correct the errors */
-    for(i=0;i<nb_errors;i++) {
-        pos = error_pos[i];
-        if (pos >= NB_DATA && pos < KK) {
-            nb_errors = -1;
-            goto the_end;
-        }
-        if (pos < NB_DATA) {
-            /* extract bit position (MSB first) */
-            pos = 10 * (NB_DATA - 1 - pos) - 6;
-            /* now correct the following 10 bits. At most two bytes
-               can be modified since pos is even */
-            index = (pos >> 3) ^ 1;
-            bitpos = pos & 7;
-            if ((index >= 0 && index < SECTOR_SIZE) ||
-                index == (SECTOR_SIZE + 1)) {
-                val = error_val[i] >> (2 + bitpos);
-                parity ^= val;
-                if (index < SECTOR_SIZE)
-                    sector[index] ^= val;
-            }
-            index = ((pos >> 3) + 1) ^ 1;
-            bitpos = (bitpos + 10) & 7;
-            if (bitpos == 0)
-                bitpos = 8;
-            if ((index >= 0 && index < SECTOR_SIZE) ||
-                index == (SECTOR_SIZE + 1)) {
-                val = error_val[i] << (8 - bitpos);
-                parity ^= val;
-                if (index < SECTOR_SIZE)
-                    sector[index] ^= val;
-            }
-        }
-    }
-
-    /* use parity to test extra errors */
-    if ((parity & 0xff) != 0)
-        nb_errors = -1;
-
- the_end:
-    kfree(Alpha_to);
-    kfree(Index_of);
-    return nb_errors;
-}
-
-EXPORT_SYMBOL_GPL(doc_decode_ecc);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Fabrice Bellard <fabrice.bellard@netgem.com>");
-MODULE_DESCRIPTION("ECC code for correcting errors detected by DiskOnChip 2000 and Millennium ECC hardware");
index 8510ccb9c6f00a450ed58cea115dd33d8070f749..3e1b0a0ef4dba22e559b74e603ebe1bf90f389cb 100644 (file)
@@ -123,7 +123,7 @@ static inline void doc_flash_address(struct docg3 *docg3, u8 addr)
        doc_writeb(docg3, addr, DOC_FLASHADDRESS);
 }
 
-static char const *part_probes[] = { "cmdlinepart", "saftlpart", NULL };
+static char const * const part_probes[] = { "cmdlinepart", "saftlpart", NULL };
 
 static int doc_register_readb(struct docg3 *docg3, int reg)
 {
@@ -2144,18 +2144,7 @@ static struct platform_driver g3_driver = {
        .remove         = __exit_p(docg3_release),
 };
 
-static int __init docg3_init(void)
-{
-       return platform_driver_probe(&g3_driver, docg3_probe);
-}
-module_init(docg3_init);
-
-
-static void __exit docg3_exit(void)
-{
-       platform_driver_unregister(&g3_driver);
-}
-module_exit(docg3_exit);
+module_platform_driver_probe(g3_driver, docg3_probe);
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Robert Jarzmik <robert.jarzmik@free.fr>");
diff --git a/drivers/mtd/devices/docprobe.c b/drivers/mtd/devices/docprobe.c
deleted file mode 100644 (file)
index 88b3fd3..0000000
+++ /dev/null
@@ -1,325 +0,0 @@
-
-/* Linux driver for Disk-On-Chip devices                       */
-/* Probe routines common to all DoC devices                    */
-/* (C) 1999 Machine Vision Holdings, Inc.                      */
-/* (C) 1999-2003 David Woodhouse <dwmw2@infradead.org>         */
-
-
-/* DOC_PASSIVE_PROBE:
-   In order to ensure that the BIOS checksum is correct at boot time, and
-   hence that the onboard BIOS extension gets executed, the DiskOnChip
-   goes into reset mode when it is read sequentially: all registers
-   return 0xff until the chip is woken up again by writing to the
-   DOCControl register.
-
-   Unfortunately, this means that the probe for the DiskOnChip is unsafe,
-   because one of the first things it does is write to where it thinks
-   the DOCControl register should be - which may well be shared memory
-   for another device. I've had machines which lock up when this is
-   attempted. Hence the possibility to do a passive probe, which will fail
-   to detect a chip in reset mode, but is at least guaranteed not to lock
-   the machine.
-
-   If you have this problem, uncomment the following line:
-#define DOC_PASSIVE_PROBE
-*/
-
-
-/* DOC_SINGLE_DRIVER:
-   Millennium driver has been merged into DOC2000 driver.
-
-   The old Millennium-only driver has been retained just in case there
-   are problems with the new code. If the combined driver doesn't work
-   for you, you can try the old one by undefining DOC_SINGLE_DRIVER
-   below and also enabling it in your configuration. If this fixes the
-   problems, please send a report to the MTD mailing list at
-   <linux-mtd@lists.infradead.org>.
-*/
-#define DOC_SINGLE_DRIVER
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <asm/errno.h>
-#include <asm/io.h>
-#include <linux/delay.h>
-#include <linux/slab.h>
-#include <linux/init.h>
-#include <linux/types.h>
-
-#include <linux/mtd/mtd.h>
-#include <linux/mtd/nand.h>
-#include <linux/mtd/doc2000.h>
-
-
-static unsigned long doc_config_location = CONFIG_MTD_DOCPROBE_ADDRESS;
-module_param(doc_config_location, ulong, 0);
-MODULE_PARM_DESC(doc_config_location, "Physical memory address at which to probe for DiskOnChip");
-
-static unsigned long __initdata doc_locations[] = {
-#if defined (__alpha__) || defined(__i386__) || defined(__x86_64__)
-#ifdef CONFIG_MTD_DOCPROBE_HIGH
-       0xfffc8000, 0xfffca000, 0xfffcc000, 0xfffce000,
-       0xfffd0000, 0xfffd2000, 0xfffd4000, 0xfffd6000,
-       0xfffd8000, 0xfffda000, 0xfffdc000, 0xfffde000,
-       0xfffe0000, 0xfffe2000, 0xfffe4000, 0xfffe6000,
-       0xfffe8000, 0xfffea000, 0xfffec000, 0xfffee000,
-#else /*  CONFIG_MTD_DOCPROBE_HIGH */
-       0xc8000, 0xca000, 0xcc000, 0xce000,
-       0xd0000, 0xd2000, 0xd4000, 0xd6000,
-       0xd8000, 0xda000, 0xdc000, 0xde000,
-       0xe0000, 0xe2000, 0xe4000, 0xe6000,
-       0xe8000, 0xea000, 0xec000, 0xee000,
-#endif /*  CONFIG_MTD_DOCPROBE_HIGH */
-#endif
-       0xffffffff };
-
-/* doccheck: Probe a given memory window to see if there's a DiskOnChip present */
-
-static inline int __init doccheck(void __iomem *potential, unsigned long physadr)
-{
-       void __iomem *window=potential;
-       unsigned char tmp, tmpb, tmpc, ChipID;
-#ifndef DOC_PASSIVE_PROBE
-       unsigned char tmp2;
-#endif
-
-       /* Routine copied from the Linux DOC driver */
-
-#ifdef CONFIG_MTD_DOCPROBE_55AA
-       /* Check for 0x55 0xAA signature at beginning of window,
-          this is no longer true once we remove the IPL (for Millennium) */
-       if (ReadDOC(window, Sig1) != 0x55 || ReadDOC(window, Sig2) != 0xaa)
-               return 0;
-#endif /* CONFIG_MTD_DOCPROBE_55AA */
-
-#ifndef DOC_PASSIVE_PROBE
-       /* It's not possible to cleanly detect the DiskOnChip - the
-        * bootup procedure will put the device into reset mode, and
-        * it's not possible to talk to it without actually writing
-        * to the DOCControl register. So we store the current contents
-        * of the DOCControl register's location, in case we later decide
-        * that it's not a DiskOnChip, and want to put it back how we
-        * found it.
-        */
-       tmp2 = ReadDOC(window, DOCControl);
-
-       /* Reset the DiskOnChip ASIC */
-       WriteDOC(DOC_MODE_CLR_ERR | DOC_MODE_MDWREN | DOC_MODE_RESET,
-                window, DOCControl);
-       WriteDOC(DOC_MODE_CLR_ERR | DOC_MODE_MDWREN | DOC_MODE_RESET,
-                window, DOCControl);
-
-       /* Enable the DiskOnChip ASIC */
-       WriteDOC(DOC_MODE_CLR_ERR | DOC_MODE_MDWREN | DOC_MODE_NORMAL,
-                window, DOCControl);
-       WriteDOC(DOC_MODE_CLR_ERR | DOC_MODE_MDWREN | DOC_MODE_NORMAL,
-                window, DOCControl);
-#endif /* !DOC_PASSIVE_PROBE */
-
-       /* We need to read the ChipID register four times. For some
-          newer DiskOnChip 2000 units, the first three reads will
-          return the DiskOnChip Millennium ident. Don't ask. */
-       ChipID = ReadDOC(window, ChipID);
-
-       switch (ChipID) {
-       case DOC_ChipID_Doc2k:
-               /* Check the TOGGLE bit in the ECC register */
-               tmp  = ReadDOC(window, 2k_ECCStatus) & DOC_TOGGLE_BIT;
-               tmpb = ReadDOC(window, 2k_ECCStatus) & DOC_TOGGLE_BIT;
-               tmpc = ReadDOC(window, 2k_ECCStatus) & DOC_TOGGLE_BIT;
-               if (tmp != tmpb && tmp == tmpc)
-                               return ChipID;
-               break;
-
-       case DOC_ChipID_DocMil:
-               /* Check for the new 2000 with Millennium ASIC */
-               ReadDOC(window, ChipID);
-               ReadDOC(window, ChipID);
-               if (ReadDOC(window, ChipID) != DOC_ChipID_DocMil)
-                       ChipID = DOC_ChipID_Doc2kTSOP;
-
-               /* Check the TOGGLE bit in the ECC register */
-               tmp  = ReadDOC(window, ECCConf) & DOC_TOGGLE_BIT;
-               tmpb = ReadDOC(window, ECCConf) & DOC_TOGGLE_BIT;
-               tmpc = ReadDOC(window, ECCConf) & DOC_TOGGLE_BIT;
-               if (tmp != tmpb && tmp == tmpc)
-                               return ChipID;
-               break;
-
-       case DOC_ChipID_DocMilPlus16:
-       case DOC_ChipID_DocMilPlus32:
-       case 0:
-               /* Possible Millennium+, need to do more checks */
-#ifndef DOC_PASSIVE_PROBE
-               /* Possibly release from power down mode */
-               for (tmp = 0; (tmp < 4); tmp++)
-                       ReadDOC(window, Mplus_Power);
-
-               /* Reset the DiskOnChip ASIC */
-               tmp = DOC_MODE_RESET | DOC_MODE_MDWREN | DOC_MODE_RST_LAT |
-                       DOC_MODE_BDECT;
-               WriteDOC(tmp, window, Mplus_DOCControl);
-               WriteDOC(~tmp, window, Mplus_CtrlConfirm);
-
-               mdelay(1);
-               /* Enable the DiskOnChip ASIC */
-               tmp = DOC_MODE_NORMAL | DOC_MODE_MDWREN | DOC_MODE_RST_LAT |
-                       DOC_MODE_BDECT;
-               WriteDOC(tmp, window, Mplus_DOCControl);
-               WriteDOC(~tmp, window, Mplus_CtrlConfirm);
-               mdelay(1);
-#endif /* !DOC_PASSIVE_PROBE */
-
-               ChipID = ReadDOC(window, ChipID);
-
-               switch (ChipID) {
-               case DOC_ChipID_DocMilPlus16:
-               case DOC_ChipID_DocMilPlus32:
-                       /* Check the TOGGLE bit in the toggle register */
-                       tmp  = ReadDOC(window, Mplus_Toggle) & DOC_TOGGLE_BIT;
-                       tmpb = ReadDOC(window, Mplus_Toggle) & DOC_TOGGLE_BIT;
-                       tmpc = ReadDOC(window, Mplus_Toggle) & DOC_TOGGLE_BIT;
-                       if (tmp != tmpb && tmp == tmpc)
-                                       return ChipID;
-               default:
-                       break;
-               }
-               /* FALL THRU */
-
-       default:
-
-#ifdef CONFIG_MTD_DOCPROBE_55AA
-               printk(KERN_DEBUG "Possible DiskOnChip with unknown ChipID %2.2X found at 0x%lx\n",
-                      ChipID, physadr);
-#endif
-#ifndef DOC_PASSIVE_PROBE
-               /* Put back the contents of the DOCControl register, in case it's not
-                * actually a DiskOnChip.
-                */
-               WriteDOC(tmp2, window, DOCControl);
-#endif
-               return 0;
-       }
-
-       printk(KERN_WARNING "DiskOnChip failed TOGGLE test, dropping.\n");
-
-#ifndef DOC_PASSIVE_PROBE
-       /* Put back the contents of the DOCControl register: it's not a DiskOnChip */
-       WriteDOC(tmp2, window, DOCControl);
-#endif
-       return 0;
-}
-
-static int docfound;
-
-extern void DoC2k_init(struct mtd_info *);
-extern void DoCMil_init(struct mtd_info *);
-extern void DoCMilPlus_init(struct mtd_info *);
-
-static void __init DoC_Probe(unsigned long physadr)
-{
-       void __iomem *docptr;
-       struct DiskOnChip *this;
-       struct mtd_info *mtd;
-       int ChipID;
-       char namebuf[15];
-       char *name = namebuf;
-       void (*initroutine)(struct mtd_info *) = NULL;
-
-       docptr = ioremap(physadr, DOC_IOREMAP_LEN);
-
-       if (!docptr)
-               return;
-
-       if ((ChipID = doccheck(docptr, physadr))) {
-               if (ChipID == DOC_ChipID_Doc2kTSOP) {
-                       /* Remove this at your own peril. The hardware driver works but nothing prevents you from erasing bad blocks */
-                       printk(KERN_NOTICE "Refusing to drive DiskOnChip 2000 TSOP until Bad Block Table is correctly supported by INFTL\n");
-                       iounmap(docptr);
-                       return;
-               }
-               docfound = 1;
-               mtd = kzalloc(sizeof(struct DiskOnChip) + sizeof(struct mtd_info), GFP_KERNEL);
-               if (!mtd) {
-                       printk(KERN_WARNING "Cannot allocate memory for data structures. Dropping.\n");
-                       iounmap(docptr);
-                       return;
-               }
-
-               this = (struct DiskOnChip *)(&mtd[1]);
-               mtd->priv = this;
-               this->virtadr = docptr;
-               this->physadr = physadr;
-               this->ChipID = ChipID;
-               sprintf(namebuf, "with ChipID %2.2X", ChipID);
-
-               switch(ChipID) {
-               case DOC_ChipID_Doc2kTSOP:
-                       name="2000 TSOP";
-                       initroutine = symbol_request(DoC2k_init);
-                       break;
-
-               case DOC_ChipID_Doc2k:
-                       name="2000";
-                       initroutine = symbol_request(DoC2k_init);
-                       break;
-
-               case DOC_ChipID_DocMil:
-                       name="Millennium";
-#ifdef DOC_SINGLE_DRIVER
-                       initroutine = symbol_request(DoC2k_init);
-#else
-                       initroutine = symbol_request(DoCMil_init);
-#endif /* DOC_SINGLE_DRIVER */
-                       break;
-
-               case DOC_ChipID_DocMilPlus16:
-               case DOC_ChipID_DocMilPlus32:
-                       name="MillenniumPlus";
-                       initroutine = symbol_request(DoCMilPlus_init);
-                       break;
-               }
-
-               if (initroutine) {
-                       (*initroutine)(mtd);
-                       symbol_put_addr(initroutine);
-                       return;
-               }
-               printk(KERN_NOTICE "Cannot find driver for DiskOnChip %s at 0x%lX\n", name, physadr);
-               kfree(mtd);
-       }
-       iounmap(docptr);
-}
-
-
-/****************************************************************************
- *
- * Module stuff
- *
- ****************************************************************************/
-
-static int __init init_doc(void)
-{
-       int i;
-
-       if (doc_config_location) {
-               printk(KERN_INFO "Using configured DiskOnChip probe address 0x%lx\n", doc_config_location);
-               DoC_Probe(doc_config_location);
-       } else {
-               for (i=0; (doc_locations[i] != 0xffffffff); i++) {
-                       DoC_Probe(doc_locations[i]);
-               }
-       }
-       /* No banner message any more. Print a message if no DiskOnChip
-          found, so the user knows we at least tried. */
-       if (!docfound)
-               printk(KERN_INFO "No recognised DiskOnChip devices found\n");
-       return -EAGAIN;
-}
-
-module_init(init_doc);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("David Woodhouse <dwmw2@infradead.org>");
-MODULE_DESCRIPTION("Probe code for DiskOnChip 2000 and Millennium devices");
-
index 2ec5da9ee24826a0790399e24cb6271891889cf8..dccef9fdc1f276269566bcc4ba03ae6e7a87111d 100644 (file)
@@ -81,14 +81,21 @@ static u32 elm_read_reg(struct elm_info *info, int offset)
  * @dev:       ELM device
  * @bch_type:  Type of BCH ecc
  */
-void elm_config(struct device *dev, enum bch_ecc bch_type)
+int elm_config(struct device *dev, enum bch_ecc bch_type)
 {
        u32 reg_val;
        struct elm_info *info = dev_get_drvdata(dev);
 
+       if (!info) {
+               dev_err(dev, "Unable to configure elm - device not probed?\n");
+               return -ENODEV;
+       }
+
        reg_val = (bch_type & ECC_BCH_LEVEL_MASK) | (ELM_ECC_SIZE << 16);
        elm_write_reg(info, ELM_LOCATION_CONFIG, reg_val);
        info->bch_type = bch_type;
+
+       return 0;
 }
 EXPORT_SYMBOL(elm_config);
 
index 5b6b0728be21b0f4c3a7ddee475f3d02ce8f3d52..2f3d2a5ff349a174dc0f473f07c67f477f0139c4 100644 (file)
@@ -681,6 +681,7 @@ struct flash_info {
        u16             flags;
 #define        SECT_4K         0x01            /* OPCODE_BE_4K works uniformly */
 #define        M25P_NO_ERASE   0x02            /* No erase command needed */
+#define        SST_WRITE       0x04            /* use SST byte programming */
 };
 
 #define INFO(_jedec_id, _ext_id, _sector_size, _n_sectors, _flags)     \
@@ -728,6 +729,7 @@ static const struct spi_device_id m25p_ids[] = {
        { "en25q32b", INFO(0x1c3016, 0, 64 * 1024,  64, 0) },
        { "en25p64", INFO(0x1c2017, 0, 64 * 1024, 128, 0) },
        { "en25q64", INFO(0x1c3017, 0, 64 * 1024, 128, SECT_4K) },
+       { "en25qh256", INFO(0x1c7019, 0, 64 * 1024, 512, 0) },
 
        /* Everspin */
        { "mr25h256", CAT25_INFO(  32 * 1024, 1, 256, 2) },
@@ -740,7 +742,6 @@ static const struct spi_device_id m25p_ids[] = {
        { "160s33b",  INFO(0x898911, 0, 64 * 1024,  32, 0) },
        { "320s33b",  INFO(0x898912, 0, 64 * 1024,  64, 0) },
        { "640s33b",  INFO(0x898913, 0, 64 * 1024, 128, 0) },
-       { "n25q064",  INFO(0x20ba17, 0, 64 * 1024, 128, 0) },
 
        /* Macronix */
        { "mx25l2005a",  INFO(0xc22012, 0, 64 * 1024,   4, SECT_4K) },
@@ -753,8 +754,10 @@ static const struct spi_device_id m25p_ids[] = {
        { "mx25l12855e", INFO(0xc22618, 0, 64 * 1024, 256, 0) },
        { "mx25l25635e", INFO(0xc22019, 0, 64 * 1024, 512, 0) },
        { "mx25l25655e", INFO(0xc22619, 0, 64 * 1024, 512, 0) },
+       { "mx66l51235l", INFO(0xc2201a, 0, 64 * 1024, 1024, 0) },
 
        /* Micron */
+       { "n25q064",  INFO(0x20ba17, 0, 64 * 1024, 128, 0) },
        { "n25q128a11",  INFO(0x20bb18, 0, 64 * 1024, 256, 0) },
        { "n25q128a13",  INFO(0x20ba18, 0, 64 * 1024, 256, 0) },
        { "n25q256a", INFO(0x20ba19, 0, 64 * 1024, 512, SECT_4K) },
@@ -781,14 +784,15 @@ static const struct spi_device_id m25p_ids[] = {
        { "s25fl064k",  INFO(0xef4017,      0,  64 * 1024, 128, SECT_4K) },
 
        /* SST -- large erase sizes are "overlays", "sectors" are 4K */
-       { "sst25vf040b", INFO(0xbf258d, 0, 64 * 1024,  8, SECT_4K) },
-       { "sst25vf080b", INFO(0xbf258e, 0, 64 * 1024, 16, SECT_4K) },
-       { "sst25vf016b", INFO(0xbf2541, 0, 64 * 1024, 32, SECT_4K) },
-       { "sst25vf032b", INFO(0xbf254a, 0, 64 * 1024, 64, SECT_4K) },
-       { "sst25wf512",  INFO(0xbf2501, 0, 64 * 1024,  1, SECT_4K) },
-       { "sst25wf010",  INFO(0xbf2502, 0, 64 * 1024,  2, SECT_4K) },
-       { "sst25wf020",  INFO(0xbf2503, 0, 64 * 1024,  4, SECT_4K) },
-       { "sst25wf040",  INFO(0xbf2504, 0, 64 * 1024,  8, SECT_4K) },
+       { "sst25vf040b", INFO(0xbf258d, 0, 64 * 1024,  8, SECT_4K | SST_WRITE) },
+       { "sst25vf080b", INFO(0xbf258e, 0, 64 * 1024, 16, SECT_4K | SST_WRITE) },
+       { "sst25vf016b", INFO(0xbf2541, 0, 64 * 1024, 32, SECT_4K | SST_WRITE) },
+       { "sst25vf032b", INFO(0xbf254a, 0, 64 * 1024, 64, SECT_4K | SST_WRITE) },
+       { "sst25vf064c", INFO(0xbf254b, 0, 64 * 1024, 128, SECT_4K) },
+       { "sst25wf512",  INFO(0xbf2501, 0, 64 * 1024,  1, SECT_4K | SST_WRITE) },
+       { "sst25wf010",  INFO(0xbf2502, 0, 64 * 1024,  2, SECT_4K | SST_WRITE) },
+       { "sst25wf020",  INFO(0xbf2503, 0, 64 * 1024,  4, SECT_4K | SST_WRITE) },
+       { "sst25wf040",  INFO(0xbf2504, 0, 64 * 1024,  8, SECT_4K | SST_WRITE) },
 
        /* ST Microelectronics -- newer production may have feature updates */
        { "m25p05",  INFO(0x202010,  0,  32 * 1024,   2, 0) },
@@ -838,6 +842,7 @@ static const struct spi_device_id m25p_ids[] = {
        { "w25q64", INFO(0xef4017, 0, 64 * 1024, 128, SECT_4K) },
        { "w25q80", INFO(0xef5014, 0, 64 * 1024,  16, SECT_4K) },
        { "w25q80bl", INFO(0xef4014, 0, 64 * 1024,  16, SECT_4K) },
+       { "w25q128", INFO(0xef4018, 0, 64 * 1024, 256, SECT_4K) },
        { "w25q256", INFO(0xef4019, 0, 64 * 1024, 512, SECT_4K) },
 
        /* Catalyst / On Semiconductor -- non-JEDEC */
@@ -1000,7 +1005,7 @@ static int m25p_probe(struct spi_device *spi)
        }
 
        /* sst flash chips use AAI word program */
-       if (JEDEC_MFR(info->jedec_id) == CFI_MFR_SST)
+       if (info->flags & SST_WRITE)
                flash->mtd._write = sst_write;
        else
                flash->mtd._write = m25p80_write;
index 945c9f7623499fb3798314e508ad65779755b863..28779b6dfcd98fc6c8b2ed5b49800d898fee7e81 100644 (file)
@@ -105,8 +105,6 @@ static const struct of_device_id dataflash_dt_ids[] = {
        { .compatible = "atmel,dataflash", },
        { /* sentinel */ }
 };
-#else
-#define dataflash_dt_ids NULL
 #endif
 
 /* ......................................................................... */
@@ -914,7 +912,7 @@ static struct spi_driver dataflash_driver = {
        .driver = {
                .name           = "mtd_dataflash",
                .owner          = THIS_MODULE,
-               .of_match_table = dataflash_dt_ids,
+               .of_match_table = of_match_ptr(dataflash_dt_ids),
        },
 
        .probe          = dataflash_probe,
index 3ed17c4d4358ad22b18ea9c77bb359b065e53ee8..bed9d58d5741246f58bafe178851e8faf42744e2 100644 (file)
@@ -249,22 +249,6 @@ config MTD_LANTIQ
        help
          Support for NOR flash attached to the Lantiq SoC's External Bus Unit.
 
-config MTD_DILNETPC
-       tristate "CFI Flash device mapped on DIL/Net PC"
-       depends on X86 && MTD_CFI_INTELEXT && BROKEN
-       help
-         MTD map driver for SSV DIL/Net PC Boards "DNP" and "ADNP".
-         For details, see <http://www.ssv-embedded.de/ssv/pc104/p169.htm>
-         and <http://www.ssv-embedded.de/ssv/pc104/p170.htm>
-
-config MTD_DILNETPC_BOOTSIZE
-       hex "Size of DIL/Net PC flash boot partition"
-       depends on MTD_DILNETPC
-       default "0x80000"
-       help
-         The amount of space taken up by the kernel or Etherboot
-         on the DIL/Net PC flash chips.
-
 config MTD_L440GX
        tristate "BIOS flash chip on Intel L440GX boards"
        depends on X86 && MTD_JEDECPROBE
@@ -274,42 +258,6 @@ config MTD_L440GX
 
          BE VERY CAREFUL.
 
-config MTD_TQM8XXL
-       tristate "CFI Flash device mapped on TQM8XXL"
-       depends on MTD_CFI && TQM8xxL
-       help
-         The TQM8xxL PowerPC board has up to two banks of CFI-compliant
-         chips, currently using AMD ones. This 'mapping' driver supports
-         that arrangement, allowing the CFI probe and command set driver
-         code to communicate with the chips on the TQM8xxL board. More at
-         <http://www.denx.de/wiki/PPCEmbedded/>.
-
-config MTD_RPXLITE
-       tristate "CFI Flash device mapped on RPX Lite or CLLF"
-       depends on MTD_CFI && (RPXCLASSIC || RPXLITE)
-       help
-         The RPXLite PowerPC board has CFI-compliant chips mapped in
-         a strange sparse mapping. This 'mapping' driver supports that
-         arrangement, allowing the CFI probe and command set driver code
-         to communicate with the chips on the RPXLite board. More at
-         <http://www.embeddedplanet.com/>.
-
-config MTD_MBX860
-       tristate "System flash on MBX860 board"
-       depends on MTD_CFI && MBX
-       help
-         This enables access routines for the flash chips on the Motorola
-         MBX860 board. If you have one of these boards and would like
-         to use the flash chips on it, say 'Y'.
-
-config MTD_DBOX2
-       tristate "CFI Flash device mapped on D-Box2"
-       depends on DBOX2 && MTD_CFI_INTELSTD && MTD_CFI_INTELEXT && MTD_CFI_AMDSTD
-       help
-         This enables access routines for the flash chips on the Nokia/Sagem
-         D-Box 2 board. If you have one of these boards and would like to use
-         the flash chips on it, say 'Y'.
-
 config MTD_CFI_FLAGADM
        tristate "CFI Flash device mapping on FlagaDM"
        depends on 8xx && MTD_CFI
@@ -349,15 +297,6 @@ config MTD_IXP4XX
          IXDP425 and Coyote. If you have an IXP4xx based board and
          would like to use the flash chips on it, say 'Y'.
 
-config MTD_IXP2000
-       tristate "CFI Flash device mapped on Intel IXP2000 based systems"
-       depends on MTD_CFI && MTD_COMPLEX_MAPPINGS && ARCH_IXP2000
-       help
-         This enables MTD access to flash devices on platforms based
-         on Intel's IXP2000 family of network processors. If you have an
-         IXP2000 based board and would like to use the flash chips on it,
-         say 'Y'.
-
 config MTD_AUTCPU12
        bool "NV-RAM mapping AUTCPU12 board"
        depends on ARCH_AUTCPU12
@@ -372,13 +311,6 @@ config MTD_IMPA7
          This enables access to the NOR Flash on the impA7 board of
          implementa GmbH. If you have such a board, say 'Y' here.
 
-config MTD_H720X
-       tristate "Hynix evaluation board mappings"
-       depends on MTD_CFI && ( ARCH_H7201 || ARCH_H7202 )
-       help
-         This enables access to the flash chips on the Hynix evaluation boards.
-         If you have such a board, say 'Y'.
-
 # This needs CFI or JEDEC, depending on the cards found.
 config MTD_PCI
        tristate "PCI MTD driver"
@@ -419,7 +351,7 @@ config MTD_BFIN_ASYNC
 
 config MTD_GPIO_ADDR
        tristate "GPIO-assisted Flash Chip Support"
-       depends on GENERIC_GPIO || GPIOLIB
+       depends on GPIOLIB
        depends on MTD_COMPLEX_MAPPINGS
        help
          Map driver which allows flashes to be partially physically addressed
@@ -433,15 +365,6 @@ config MTD_UCLINUX
        help
          Map driver to support image based filesystems for uClinux.
 
-config MTD_DMV182
-        tristate "Map driver for Dy-4 SVME/DMV-182 board."
-        depends on DMV182
-       select MTD_MAP_BANK_WIDTH_32
-       select MTD_CFI_I8
-       select MTD_CFI_AMDSTD
-        help
-          Map driver for Dy-4 SVME/DMV-182 board.
-
 config MTD_INTEL_VR_NOR
        tristate "NOR flash on Intel Vermilion Range Expansion Bus CS0"
        depends on PCI
index 4ded28711bc11a0b53789d74c49c5e0842729ce3..395a12444048e6f2c7fbaa64d028be488551e187 100644 (file)
@@ -9,7 +9,6 @@ endif
 # Chip mappings
 obj-$(CONFIG_MTD_CFI_FLAGADM)  += cfi_flagadm.o
 obj-$(CONFIG_MTD_DC21285)      += dc21285.o
-obj-$(CONFIG_MTD_DILNETPC)     += dilnetpc.o
 obj-$(CONFIG_MTD_L440GX)       += l440gx.o
 obj-$(CONFIG_MTD_AMD76XROM)    += amd76xrom.o
 obj-$(CONFIG_MTD_ESB2ROM)      += esb2rom.o
@@ -17,15 +16,12 @@ obj-$(CONFIG_MTD_ICHXROM)   += ichxrom.o
 obj-$(CONFIG_MTD_CK804XROM)    += ck804xrom.o
 obj-$(CONFIG_MTD_TSUNAMI)      += tsunami_flash.o
 obj-$(CONFIG_MTD_PXA2XX)       += pxa2xx-flash.o
-obj-$(CONFIG_MTD_MBX860)       += mbx860.o
 obj-$(CONFIG_MTD_OCTAGON)      += octagon-5066.o
 obj-$(CONFIG_MTD_PHYSMAP)      += physmap.o
 obj-$(CONFIG_MTD_PHYSMAP_OF)   += physmap_of.o
 obj-$(CONFIG_MTD_PISMO)                += pismo.o
 obj-$(CONFIG_MTD_PMC_MSP_EVM)   += pmcmsp-flash.o
 obj-$(CONFIG_MTD_PCMCIA)       += pcmciamtd.o
-obj-$(CONFIG_MTD_RPXLITE)      += rpxlite.o
-obj-$(CONFIG_MTD_TQM8XXL)      += tqm8xxl.o
 obj-$(CONFIG_MTD_SA1100)       += sa1100-flash.o
 obj-$(CONFIG_MTD_SBC_GXX)      += sbc_gxx.o
 obj-$(CONFIG_MTD_SC520CDP)     += sc520cdp.o
@@ -34,7 +30,6 @@ obj-$(CONFIG_MTD_TS5500)      += ts5500_flash.o
 obj-$(CONFIG_MTD_SUN_UFLASH)   += sun_uflash.o
 obj-$(CONFIG_MTD_VMAX)         += vmax301.o
 obj-$(CONFIG_MTD_SCx200_DOCFLASH)+= scx200_docflash.o
-obj-$(CONFIG_MTD_DBOX2)                += dbox2-flash.o
 obj-$(CONFIG_MTD_SOLUTIONENGINE)+= solutionengine.o
 obj-$(CONFIG_MTD_PCI)          += pci.o
 obj-$(CONFIG_MTD_AUTCPU12)     += autcpu12-nvram.o
@@ -42,10 +37,7 @@ obj-$(CONFIG_MTD_IMPA7)              += impa7.o
 obj-$(CONFIG_MTD_UCLINUX)      += uclinux.o
 obj-$(CONFIG_MTD_NETtel)       += nettel.o
 obj-$(CONFIG_MTD_SCB2_FLASH)   += scb2_flash.o
-obj-$(CONFIG_MTD_H720X)                += h720x-flash.o
 obj-$(CONFIG_MTD_IXP4XX)       += ixp4xx.o
-obj-$(CONFIG_MTD_IXP2000)      += ixp2000.o
-obj-$(CONFIG_MTD_DMV182)       += dmv182.o
 obj-$(CONFIG_MTD_PLATRAM)      += plat-ram.o
 obj-$(CONFIG_MTD_INTEL_VR_NOR) += intel_vr_nor.o
 obj-$(CONFIG_MTD_BFIN_ASYNC)   += bfin-async-flash.o
index f833edfaab79f7119f28a811ea6abd56318f90b3..319b04a6c9d1f6a0ccc5c565c977740d63babaa9 100644 (file)
@@ -122,7 +122,8 @@ static void bfin_flash_copy_to(struct map_info *map, unsigned long to, const voi
        switch_back(state);
 }
 
-static const char *part_probe_types[] = { "cmdlinepart", "RedBoot", NULL };
+static const char * const part_probe_types[] = {
+       "cmdlinepart", "RedBoot", NULL };
 
 static int bfin_flash_probe(struct platform_device *pdev)
 {
index 586a1c77e48a7f29a6459e11387ad4d76c412383..0455166f05faeaf364252bf0ddd39ba1fa24931b 100644 (file)
@@ -308,8 +308,7 @@ static int ck804xrom_init_one(struct pci_dev *pdev,
 
  out:
        /* Free any left over map structures */
-       if (map)
-               kfree(map);
+       kfree(map);
 
        /* See if I have any map structures */
        if (list_empty(&window->maps)) {
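The if (map) guard above is dropped because kfree(), like free() in user space, is specified to be a no-op when given a NULL pointer. A trivial user-space equivalent of the simplified cleanup, for illustration only:

#include <stdlib.h>

static void release_map(void *map)
{
	/* Safe even when map == NULL; an explicit NULL check adds nothing. */
	free(map);
}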
diff --git a/drivers/mtd/maps/dbox2-flash.c b/drivers/mtd/maps/dbox2-flash.c
deleted file mode 100644 (file)
index 85bdece..0000000
+++ /dev/null
@@ -1,123 +0,0 @@
-/*
- * D-Box 2 flash driver
- */
-
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <asm/io.h>
-#include <linux/mtd/mtd.h>
-#include <linux/mtd/map.h>
-#include <linux/mtd/partitions.h>
-#include <linux/errno.h>
-
-/* partition_info gives details on the logical partitions that split the
- * single flash device into. If the size is zero we use up to the end of the
- * device. */
-static struct mtd_partition partition_info[]= {
-       {
-       .name           = "BR bootloader",
-       .size           = 128 * 1024,
-       .offset         = 0,
-       .mask_flags     = MTD_WRITEABLE
-       },
-       {
-       .name           = "FLFS (U-Boot)",
-       .size           = 128 * 1024,
-       .offset         = MTDPART_OFS_APPEND,
-       .mask_flags     = 0
-       },
-       {
-       .name           = "Root (SquashFS)",
-       .size           = 7040 * 1024,
-       .offset         = MTDPART_OFS_APPEND,
-       .mask_flags     = 0
-       },
-       {
-       .name           = "var (JFFS2)",
-       .size           = 896 * 1024,
-       .offset         = MTDPART_OFS_APPEND,
-       .mask_flags     = 0
-       },
-       {
-       .name           = "Flash without bootloader",
-       .size           = MTDPART_SIZ_FULL,
-       .offset         = 128 * 1024,
-       .mask_flags     = 0
-       },
-       {
-       .name           = "Complete Flash",
-       .size           = MTDPART_SIZ_FULL,
-       .offset         = 0,
-       .mask_flags     = MTD_WRITEABLE
-       }
-};
-
-#define NUM_PARTITIONS ARRAY_SIZE(partition_info)
-
-#define WINDOW_ADDR 0x10000000
-#define WINDOW_SIZE 0x800000
-
-static struct mtd_info *mymtd;
-
-
-struct map_info dbox2_flash_map = {
-       .name           = "D-Box 2 flash memory",
-       .size           = WINDOW_SIZE,
-       .bankwidth      = 4,
-       .phys           = WINDOW_ADDR,
-};
-
-static int __init init_dbox2_flash(void)
-{
-               printk(KERN_NOTICE "D-Box 2 flash driver (size->0x%X mem->0x%X)\n", WINDOW_SIZE, WINDOW_ADDR);
-       dbox2_flash_map.virt = ioremap(WINDOW_ADDR, WINDOW_SIZE);
-
-       if (!dbox2_flash_map.virt) {
-               printk("Failed to ioremap\n");
-               return -EIO;
-       }
-       simple_map_init(&dbox2_flash_map);
-
-       // Probe for dual Intel 28F320 or dual AMD
-       mymtd = do_map_probe("cfi_probe", &dbox2_flash_map);
-       if (!mymtd) {
-           // Probe for single Intel 28F640
-           dbox2_flash_map.bankwidth = 2;
-
-           mymtd = do_map_probe("cfi_probe", &dbox2_flash_map);
-       }
-
-       if (mymtd) {
-               mymtd->owner = THIS_MODULE;
-
-                /* Create MTD devices for each partition. */
-               mtd_device_register(mymtd, partition_info, NUM_PARTITIONS);
-
-               return 0;
-       }
-
-       iounmap((void *)dbox2_flash_map.virt);
-       return -ENXIO;
-}
-
-static void __exit cleanup_dbox2_flash(void)
-{
-       if (mymtd) {
-               mtd_device_unregister(mymtd);
-               map_destroy(mymtd);
-       }
-       if (dbox2_flash_map.virt) {
-               iounmap((void *)dbox2_flash_map.virt);
-               dbox2_flash_map.virt = 0;
-       }
-}
-
-module_init(init_dbox2_flash);
-module_exit(cleanup_dbox2_flash);
-
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Kári Davíðsson <kd@flaga.is>, Bastian Blank <waldi@tuxbox.org>, Alexander Wild <wild@te-elektronik.com>");
-MODULE_DESCRIPTION("MTD map driver for D-Box 2 board");
index 080f06053bd449d51467f3ab294acfe05289ed34..f8a7dd14cee0cc7a8b6767f0ef281ea95dd89149 100644 (file)
@@ -143,9 +143,8 @@ static struct map_info dc21285_map = {
        .copy_from = dc21285_copy_from,
 };
 
-
 /* Partition stuff */
-static const char *probes[] = { "RedBoot", "cmdlinepart", NULL };
+static const char * const probes[] = { "RedBoot", "cmdlinepart", NULL };
 
 static int __init init_dc21285(void)
 {
diff --git a/drivers/mtd/maps/dilnetpc.c b/drivers/mtd/maps/dilnetpc.c
deleted file mode 100644 (file)
index 3e393f0..0000000
+++ /dev/null
@@ -1,496 +0,0 @@
-/* dilnetpc.c -- MTD map driver for SSV DIL/Net PC Boards "DNP" and "ADNP"
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
- *
- * The DIL/Net PC is a tiny embedded PC board made by SSV Embedded Systems
- * featuring the AMD Elan SC410 processor. There are two variants of this
- * board: DNP/1486 and ADNP/1486. The DNP version has 2 megs of flash
- * ROM (Intel 28F016S3) and 8 megs of DRAM, the ADNP version has 4 megs
- * flash and 16 megs of RAM.
- * For details, see http://www.ssv-embedded.de/ssv/pc104/p169.htm
- * and http://www.ssv-embedded.de/ssv/pc104/p170.htm
- */
-
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/string.h>
-
-#include <linux/mtd/mtd.h>
-#include <linux/mtd/map.h>
-#include <linux/mtd/partitions.h>
-#include <linux/mtd/concat.h>
-
-#include <asm/io.h>
-
-/*
-** The DIL/NetPC keeps its BIOS in two distinct flash blocks.
-** Destroying any of these blocks transforms the DNPC into
-** a paperweight (albeit not a very useful one, considering
-** it only weighs a few grams).
-**
-** Therefore, the BIOS blocks must never be erased or written to
-** except by people who know exactly what they are doing (e.g.
-** to install a BIOS update). These partitions are marked read-only
-** by default, but can be made read/write by undefining
-** DNPC_BIOS_BLOCKS_WRITEPROTECTED:
-*/
-#define DNPC_BIOS_BLOCKS_WRITEPROTECTED
-
-/*
-** The ID string (in ROM) is checked to determine whether we
-** are running on a DNP/1486 or ADNP/1486
-*/
-#define BIOSID_BASE    0x000fe100
-
-#define ID_DNPC        "DNP1486"
-#define ID_ADNP        "ADNP1486"
-
-/*
-** Address where the flash should appear in CPU space
-*/
-#define FLASH_BASE     0x2000000
-
-/*
-** Chip Setup and Control (CSC) indexed register space
-*/
-#define CSC_INDEX      0x22
-#define CSC_DATA       0x23
-
-#define CSC_MMSWAR     0x30    /* MMS window C-F attributes register */
-#define CSC_MMSWDSR    0x31    /* MMS window C-F device select register */
-
-#define CSC_RBWR       0xa7    /* GPIO Read-Back/Write Register B */
-
-#define CSC_CR         0xd0    /* internal I/O device disable/Echo */
-                               /* Z-bus/configuration register */
-
-#define CSC_PCCMDCR    0xf1    /* PC card mode and DMA control register */
-
-
-/*
-** PC Card indexed register space:
-*/
-
-#define PCC_INDEX      0x3e0
-#define PCC_DATA       0x3e1
-
-#define PCC_AWER_B             0x46    /* Socket B Address Window enable register */
-#define PCC_MWSAR_1_Lo 0x58    /* memory window 1 start address low register */
-#define PCC_MWSAR_1_Hi 0x59    /* memory window 1 start address high register */
-#define PCC_MWEAR_1_Lo 0x5A    /* memory window 1 stop address low register */
-#define PCC_MWEAR_1_Hi 0x5B    /* memory window 1 stop address high register */
-#define PCC_MWAOR_1_Lo 0x5C    /* memory window 1 address offset low register */
-#define PCC_MWAOR_1_Hi 0x5D    /* memory window 1 address offset high register */
-
-
-/*
-** Access to SC4x0's Chip Setup and Control (CSC)
-** and PC Card (PCC) indexed registers:
-*/
-static inline void setcsc(int reg, unsigned char data)
-{
-       outb(reg, CSC_INDEX);
-       outb(data, CSC_DATA);
-}
-
-static inline unsigned char getcsc(int reg)
-{
-       outb(reg, CSC_INDEX);
-       return(inb(CSC_DATA));
-}
-
-static inline void setpcc(int reg, unsigned char data)
-{
-       outb(reg, PCC_INDEX);
-       outb(data, PCC_DATA);
-}
-
-static inline unsigned char getpcc(int reg)
-{
-       outb(reg, PCC_INDEX);
-       return(inb(PCC_DATA));
-}
-
-
-/*
-************************************************************
-** Enable access to DIL/NetPC's flash by mapping it into
-** the SC4x0's MMS Window C.
-************************************************************
-*/
-static void dnpc_map_flash(unsigned long flash_base, unsigned long flash_size)
-{
-       unsigned long flash_end = flash_base + flash_size - 1;
-
-       /*
-       ** enable setup of MMS windows C-F:
-       */
-       /* - enable PC Card indexed register space */
-       setcsc(CSC_CR, getcsc(CSC_CR) | 0x2);
-       /* - set PC Card controller to operate in standard mode */
-       setcsc(CSC_PCCMDCR, getcsc(CSC_PCCMDCR) & ~1);
-
-       /*
-       ** Program base address and end address of window
-       ** where the flash ROM should appear in CPU address space
-       */
-       setpcc(PCC_MWSAR_1_Lo, (flash_base >> 12) & 0xff);
-       setpcc(PCC_MWSAR_1_Hi, (flash_base >> 20) & 0x3f);
-       setpcc(PCC_MWEAR_1_Lo, (flash_end >> 12) & 0xff);
-       setpcc(PCC_MWEAR_1_Hi, (flash_end >> 20) & 0x3f);
-
-       /* program offset of first flash location to appear in this window (0) */
-       setpcc(PCC_MWAOR_1_Lo, ((0 - flash_base) >> 12) & 0xff);
-       setpcc(PCC_MWAOR_1_Hi, ((0 - flash_base)>> 20) & 0x3f);
-
-       /* set attributes for MMS window C: non-cacheable, write-enabled */
-       setcsc(CSC_MMSWAR, getcsc(CSC_MMSWAR) & ~0x11);
-
-       /* select physical device ROMCS0 (i.e. flash) for MMS Window C */
-       setcsc(CSC_MMSWDSR, getcsc(CSC_MMSWDSR) & ~0x03);
-
-       /* enable memory window 1 */
-       setpcc(PCC_AWER_B, getpcc(PCC_AWER_B) | 0x02);
-
-       /* now disable PC Card indexed register space again */
-       setcsc(CSC_CR, getcsc(CSC_CR) & ~0x2);
-}
-
-
-/*
-************************************************************
-** Disable access to DIL/NetPC's flash by mapping it into
-** the SC4x0's MMS Window C.
-************************************************************
-*/
-static void dnpc_unmap_flash(void)
-{
-       /* - enable PC Card indexed register space */
-       setcsc(CSC_CR, getcsc(CSC_CR) | 0x2);
-
-       /* disable memory window 1 */
-       setpcc(PCC_AWER_B, getpcc(PCC_AWER_B) & ~0x02);
-
-       /* now disable PC Card indexed register space again */
-       setcsc(CSC_CR, getcsc(CSC_CR) & ~0x2);
-}
-
-
-
-/*
-************************************************************
-** Enable/Disable VPP to write to flash
-************************************************************
-*/
-
-static DEFINE_SPINLOCK(dnpc_spin);
-static int        vpp_counter = 0;
-/*
-** This is what has to be done for the DNP board ..
-*/
-static void dnp_set_vpp(struct map_info *not_used, int on)
-{
-       spin_lock_irq(&dnpc_spin);
-
-       if (on)
-       {
-               if(++vpp_counter == 1)
-                       setcsc(CSC_RBWR, getcsc(CSC_RBWR) & ~0x4);
-       }
-       else
-       {
-               if(--vpp_counter == 0)
-                       setcsc(CSC_RBWR, getcsc(CSC_RBWR) | 0x4);
-               else
-                       BUG_ON(vpp_counter < 0);
-       }
-       spin_unlock_irq(&dnpc_spin);
-}
-
-/*
-** .. and this the ADNP version:
-*/
-static void adnp_set_vpp(struct map_info *not_used, int on)
-{
-       spin_lock_irq(&dnpc_spin);
-
-       if (on)
-       {
-               if(++vpp_counter == 1)
-                       setcsc(CSC_RBWR, getcsc(CSC_RBWR) & ~0x8);
-       }
-       else
-       {
-               if(--vpp_counter == 0)
-                       setcsc(CSC_RBWR, getcsc(CSC_RBWR) | 0x8);
-               else
-                       BUG_ON(vpp_counter < 0);
-       }
-       spin_unlock_irq(&dnpc_spin);
-}
-
-
-
-#define DNP_WINDOW_SIZE                0x00200000      /*  DNP flash size is 2MiB  */
-#define ADNP_WINDOW_SIZE       0x00400000      /* ADNP flash size is 4MiB */
-#define WINDOW_ADDR            FLASH_BASE
-
-static struct map_info dnpc_map = {
-       .name = "ADNP Flash Bank",
-       .size = ADNP_WINDOW_SIZE,
-       .bankwidth = 1,
-       .set_vpp = adnp_set_vpp,
-       .phys = WINDOW_ADDR
-};
-
-/*
-** The layout of the flash is somewhat "strange":
-**
-** 1.  960 KiB (15 blocks) : Space for ROM Bootloader and user data
-** 2.   64 KiB (1 block)   : System BIOS
-** 3.  960 KiB (15 blocks) : User Data (DNP model) or
-** 3. 3008 KiB (47 blocks) : User Data (ADNP model)
-** 4.   64 KiB (1 block)   : System BIOS Entry
-*/
-
-static struct mtd_partition partition_info[]=
-{
-       {
-               .name =         "ADNP boot",
-               .offset =       0,
-               .size =         0xf0000,
-       },
-       {
-               .name =         "ADNP system BIOS",
-               .offset =       MTDPART_OFS_NXTBLK,
-               .size =         0x10000,
-#ifdef DNPC_BIOS_BLOCKS_WRITEPROTECTED
-               .mask_flags =   MTD_WRITEABLE,
-#endif
-       },
-       {
-               .name =         "ADNP file system",
-               .offset =       MTDPART_OFS_NXTBLK,
-               .size =         0x2f0000,
-       },
-       {
-               .name =         "ADNP system BIOS entry",
-               .offset =       MTDPART_OFS_NXTBLK,
-               .size =         MTDPART_SIZ_FULL,
-#ifdef DNPC_BIOS_BLOCKS_WRITEPROTECTED
-               .mask_flags =   MTD_WRITEABLE,
-#endif
-       },
-};
-
-#define NUM_PARTITIONS ARRAY_SIZE(partition_info)
-
-static struct mtd_info *mymtd;
-static struct mtd_info *lowlvl_parts[NUM_PARTITIONS];
-static struct mtd_info *merged_mtd;
-
-/*
-** "Highlevel" partition info:
-**
-** Using the MTD concat layer, we can re-arrange partitions to our
-** liking: we construct a virtual MTD device by concatenating the
-** partitions, specifying the sequence such that the boot block
-** is immediately followed by the filesystem block (i.e. the stupid
-** system BIOS block is mapped to a different place). When re-partitioning
-** this concatenated MTD device, we can set the boot block size to
-** an arbitrary (though erase block aligned) value i.e. not one that
-** is dictated by the flash's physical layout. We can thus set the
-** boot block to be e.g. 64 KB (which is fully sufficient if we want
-** to boot an etherboot image) or to -say- 1.5 MB if we want to boot
-** a large kernel image. In all cases, the remainder of the flash
-** is available as file system space.
-*/
-
-static struct mtd_partition higlvl_partition_info[]=
-{
-       {
-               .name =         "ADNP boot block",
-               .offset =       0,
-               .size =         CONFIG_MTD_DILNETPC_BOOTSIZE,
-       },
-       {
-               .name =         "ADNP file system space",
-               .offset =       MTDPART_OFS_NXTBLK,
-               .size =         ADNP_WINDOW_SIZE-CONFIG_MTD_DILNETPC_BOOTSIZE-0x20000,
-       },
-       {
-               .name =         "ADNP system BIOS + BIOS Entry",
-               .offset =       MTDPART_OFS_NXTBLK,
-               .size =         MTDPART_SIZ_FULL,
-#ifdef DNPC_BIOS_BLOCKS_WRITEPROTECTED
-               .mask_flags =   MTD_WRITEABLE,
-#endif
-       },
-};
-
-#define NUM_HIGHLVL_PARTITIONS ARRAY_SIZE(higlvl_partition_info)
-
-
-static int dnp_adnp_probe(void)
-{
-       char *biosid, rc = -1;
-
-       biosid = (char*)ioremap(BIOSID_BASE, 16);
-       if(biosid)
-       {
-               if(!strcmp(biosid, ID_DNPC))
-                       rc = 1;         /* this is a DNPC  */
-               else if(!strcmp(biosid, ID_ADNP))
-                       rc = 0;         /* this is a ADNPC */
-       }
-       iounmap((void *)biosid);
-       return(rc);
-}
-
-
-static int __init init_dnpc(void)
-{
-       int is_dnp;
-
-       /*
-       ** determine hardware (DNP/ADNP/invalid)
-       */
-       if((is_dnp = dnp_adnp_probe()) < 0)
-               return -ENXIO;
-
-       /*
-       ** Things are set up for ADNP by default
-       ** -> modify all that needs to be different for DNP
-       */
-       if(is_dnp)
-       {       /*
-               ** Adjust window size, select correct set_vpp function.
-               ** The partitioning scheme is identical on both DNP
-               ** and ADNP except for the size of the third partition.
-               */
-               int i;
-               dnpc_map.size          = DNP_WINDOW_SIZE;
-               dnpc_map.set_vpp       = dnp_set_vpp;
-               partition_info[2].size = 0xf0000;
-
-               /*
-               ** increment all string pointers so the leading 'A' gets skipped,
-               ** thus turning all occurrences of "ADNP ..." into "DNP ..."
-               */
-               ++dnpc_map.name;
-               for(i = 0; i < NUM_PARTITIONS; i++)
-                       ++partition_info[i].name;
-               higlvl_partition_info[1].size = DNP_WINDOW_SIZE -
-                       CONFIG_MTD_DILNETPC_BOOTSIZE - 0x20000;
-               for(i = 0; i < NUM_HIGHLVL_PARTITIONS; i++)
-                       ++higlvl_partition_info[i].name;
-       }
-
-       printk(KERN_NOTICE "DIL/Net %s flash: 0x%lx at 0x%llx\n",
-               is_dnp ? "DNPC" : "ADNP", dnpc_map.size, (unsigned long long)dnpc_map.phys);
-
-       dnpc_map.virt = ioremap_nocache(dnpc_map.phys, dnpc_map.size);
-
-       dnpc_map_flash(dnpc_map.phys, dnpc_map.size);
-
-       if (!dnpc_map.virt) {
-               printk("Failed to ioremap_nocache\n");
-               return -EIO;
-       }
-       simple_map_init(&dnpc_map);
-
-       printk("FLASH virtual address: 0x%p\n", dnpc_map.virt);
-
-       mymtd = do_map_probe("jedec_probe", &dnpc_map);
-
-       if (!mymtd)
-               mymtd = do_map_probe("cfi_probe", &dnpc_map);
-
-       /*
-       ** If flash probes fail, try to make flashes accessible
-       ** at least as ROM. Adjust erasesize in this case since
-       ** the default one (128M) will break our partitioning
-       */
-       if (!mymtd)
-               if((mymtd = do_map_probe("map_rom", &dnpc_map)))
-                       mymtd->erasesize = 0x10000;
-
-       if (!mymtd) {
-               iounmap(dnpc_map.virt);
-               return -ENXIO;
-       }
-
-       mymtd->owner = THIS_MODULE;
-
-       /*
-       ** Supply pointers to lowlvl_parts[] array to add_mtd_partitions()
-       ** -> add_mtd_partitions() will _not_ register MTD devices for
-       ** the partitions, but will instead store pointers to the MTD
-       ** objects it creates into our lowlvl_parts[] array.
-       ** NOTE: we arrange the pointers such that the sequence of the
-       **       partitions gets re-arranged: partition #2 follows
-       **       partition #0.
-       */
-       partition_info[0].mtdp = &lowlvl_parts[0];
-       partition_info[1].mtdp = &lowlvl_parts[2];
-       partition_info[2].mtdp = &lowlvl_parts[1];
-       partition_info[3].mtdp = &lowlvl_parts[3];
-
-       mtd_device_register(mymtd, partition_info, NUM_PARTITIONS);
-
-       /*
-       ** now create a virtual MTD device by concatenating the four partitions
-       ** (in the sequence given by the lowlvl_parts[] array).
-       */
-       merged_mtd = mtd_concat_create(lowlvl_parts, NUM_PARTITIONS, "(A)DNP Flash Concatenated");
-       if(merged_mtd)
-       {       /*
-               ** now partition the new device the way we want it. This time,
-               ** we do not supply mtd pointers in higlvl_partition_info, so
-               ** add_mtd_partitions() will register the devices.
-               */
-               mtd_device_register(merged_mtd, higlvl_partition_info,
-                                   NUM_HIGHLVL_PARTITIONS);
-       }
-
-       return 0;
-}
-
-static void __exit cleanup_dnpc(void)
-{
-       if(merged_mtd) {
-               mtd_device_unregister(merged_mtd);
-               mtd_concat_destroy(merged_mtd);
-       }
-
-       if (mymtd) {
-               mtd_device_unregister(mymtd);
-               map_destroy(mymtd);
-       }
-       if (dnpc_map.virt) {
-               iounmap(dnpc_map.virt);
-               dnpc_unmap_flash();
-               dnpc_map.virt = NULL;
-       }
-}
-
-module_init(init_dnpc);
-module_exit(cleanup_dnpc);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Sysgo Real-Time Solutions GmbH");
-MODULE_DESCRIPTION("MTD map driver for SSV DIL/NetPC DNP & ADNP");
diff --git a/drivers/mtd/maps/dmv182.c b/drivers/mtd/maps/dmv182.c
deleted file mode 100644 (file)
index 6538ac6..0000000
+++ /dev/null
@@ -1,146 +0,0 @@
-
-/*
- * drivers/mtd/maps/dmv182.c
- *
- * Flash map driver for the Dy4 SVME182 board
- *
- * Copyright 2003-2004, TimeSys Corporation
- *
- * Based on the SVME181 flash map, by Tom Nelson, Dot4, Inc. for TimeSys Corp.
- *
- * This program is free software; you can redistribute  it and/or modify it
- * under  the terms of  the GNU General  Public License as published by the
- * Free Software Foundation;  either version 2 of the  License, or (at your
- * option) any later version.
- */
-
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <asm/io.h>
-#include <linux/mtd/mtd.h>
-#include <linux/mtd/map.h>
-#include <linux/mtd/partitions.h>
-#include <linux/errno.h>
-
-/*
- * This driver currently handles only the 16MiB user flash bank 1 on the
- * board.  It does not provide access to bank 0 (contains the Dy4 FFW), bank 2
- * (VxWorks boot), or the optional 48MiB expansion flash.
- *
- * scott.wood@timesys.com: On the newer boards with 128MiB flash, it
- * now supports the first 96MiB (the boot flash bank containing FFW
- * is excluded).  The VxWorks loader is in partition 1.
- */
-
-#define FLASH_BASE_ADDR 0xf0000000
-#define FLASH_BANK_SIZE (128*1024*1024)
-
-MODULE_AUTHOR("Scott Wood, TimeSys Corporation <scott.wood@timesys.com>");
-MODULE_DESCRIPTION("User-programmable flash device on the Dy4 SVME182 board");
-MODULE_LICENSE("GPL");
-
-static struct map_info svme182_map = {
-       .name           = "Dy4 SVME182",
-       .bankwidth      = 32,
-       .size           =  128 * 1024 * 1024
-};
-
-#define BOOTIMAGE_PART_SIZE            ((6*1024*1024)-RESERVED_PART_SIZE)
-
-// Allow 6MiB for the kernel
-#define NEW_BOOTIMAGE_PART_SIZE  (6 * 1024 * 1024)
-// Allow 1MiB for the bootloader
-#define NEW_BOOTLOADER_PART_SIZE (1024 * 1024)
-// Use the remaining 9MiB at the end of flash for the RFS
-#define NEW_RFS_PART_SIZE        (0x01000000 - NEW_BOOTLOADER_PART_SIZE - \
-                                  NEW_BOOTIMAGE_PART_SIZE)
-
-static struct mtd_partition svme182_partitions[] = {
-       // The Lower PABS is only 128KiB, but the partition code doesn't
-       // like partitions that don't end on the largest erase block
-       // size of the device, even if all of the erase blocks in the
-       // partition are small ones.  The hardware should prevent
-       // writes to the actual PABS areas.
-       {
-               name:       "Lower PABS and CPU 0 bootloader or kernel",
-               size:       6*1024*1024,
-               offset:     0,
-       },
-       {
-               name:       "Root Filesystem",
-               size:       10*1024*1024,
-               offset:     MTDPART_OFS_NXTBLK
-       },
-       {
-               name:       "CPU1 Bootloader",
-               size:       1024*1024,
-               offset:     MTDPART_OFS_NXTBLK,
-       },
-       {
-               name:       "Extra",
-               size:       110*1024*1024,
-               offset:     MTDPART_OFS_NXTBLK
-       },
-       {
-               name:       "Foundation Firmware and Upper PABS",
-               size:       1024*1024,
-               offset:     MTDPART_OFS_NXTBLK,
-               mask_flags: MTD_WRITEABLE // read-only
-       }
-};
-
-static struct mtd_info *this_mtd;
-
-static int __init init_svme182(void)
-{
-       struct mtd_partition *partitions;
-       int num_parts = ARRAY_SIZE(svme182_partitions);
-
-       partitions = svme182_partitions;
-
-       svme182_map.virt = ioremap(FLASH_BASE_ADDR, svme182_map.size);
-
-       if (svme182_map.virt == 0) {
-               printk("Failed to ioremap FLASH memory area.\n");
-               return -EIO;
-       }
-
-       simple_map_init(&svme182_map);
-
-       this_mtd = do_map_probe("cfi_probe", &svme182_map);
-       if (!this_mtd)
-       {
-               iounmap((void *)svme182_map.virt);
-               return -ENXIO;
-       }
-
-       printk(KERN_NOTICE "SVME182 flash device: %dMiB at 0x%08x\n",
-                  this_mtd->size >> 20, FLASH_BASE_ADDR);
-
-       this_mtd->owner = THIS_MODULE;
-       mtd_device_register(this_mtd, partitions, num_parts);
-
-       return 0;
-}
-
-static void __exit cleanup_svme182(void)
-{
-       if (this_mtd)
-       {
-               mtd_device_unregister(this_mtd);
-               map_destroy(this_mtd);
-       }
-
-       if (svme182_map.virt)
-       {
-               iounmap((void *)svme182_map.virt);
-               svme182_map.virt = 0;
-       }
-
-       return;
-}
-
-module_init(init_svme182);
-module_exit(cleanup_svme182);
index 7b643de2500b35cec6169169d9356592f97c637d..5ede28294f9e49765a9a24ed12867b94cc9e3208 100644 (file)
@@ -157,7 +157,8 @@ static void gf_copy_to(struct map_info *map, unsigned long to,
        memcpy_toio(map->virt + (to % state->win_size), from, len);
 }
 
-static const char *part_probe_types[] = { "cmdlinepart", "RedBoot", NULL };
+static const char * const part_probe_types[] = {
+       "cmdlinepart", "RedBoot", NULL };
 
 /**
  * gpio_flash_probe() - setup a mapping for a GPIO assisted flash
diff --git a/drivers/mtd/maps/h720x-flash.c b/drivers/mtd/maps/h720x-flash.c
deleted file mode 100644 (file)
index 8ed6cb4..0000000
+++ /dev/null
@@ -1,120 +0,0 @@
-/*
- * Flash memory access on Hynix GMS30C7201/HMS30C7202 based
- * evaluation boards
- *
- * (C) 2002 Jungjun Kim <jungjun.kim@hynix.com>
- *     2003 Thomas Gleixner <tglx@linutronix.de>
- */
-
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/errno.h>
-#include <linux/slab.h>
-
-#include <linux/mtd/mtd.h>
-#include <linux/mtd/map.h>
-#include <linux/mtd/partitions.h>
-#include <mach/hardware.h>
-#include <asm/io.h>
-
-static struct mtd_info *mymtd;
-
-static struct map_info h720x_map = {
-       .name =         "H720X",
-       .bankwidth =    4,
-       .size =         H720X_FLASH_SIZE,
-       .phys =         H720X_FLASH_PHYS,
-};
-
-static struct mtd_partition h720x_partitions[] = {
-        {
-                .name = "ArMon",
-                .size = 0x00080000,
-                .offset = 0,
-                .mask_flags = MTD_WRITEABLE
-        },{
-         chips, currently AMD ones. This 'mapping' driver supports
-                .size = 0x00040000,
-                .offset = 0x00080000,
-                .mask_flags = MTD_WRITEABLE
-        },{
-                .name = "Kernel",
-                .size = 0x00180000,
-                .offset = 0x000c0000,
-                .mask_flags = MTD_WRITEABLE
-        },{
-                .name = "Ramdisk",
-                .size = 0x00400000,
-                .offset = 0x00240000,
-                .mask_flags = MTD_WRITEABLE
-        },{
-                .name = "jffs2",
-                .size = MTDPART_SIZ_FULL,
-                .offset = MTDPART_OFS_APPEND
-        }
-};
-
-#define NUM_PARTITIONS ARRAY_SIZE(h720x_partitions)
-
-/*
- * Initialize FLASH support
- */
-static int __init h720x_mtd_init(void)
-{
-       h720x_map.virt = ioremap(h720x_map.phys, h720x_map.size);
-
-       if (!h720x_map.virt) {
-               printk(KERN_ERR "H720x-MTD: ioremap failed\n");
-               return -EIO;
-       }
-
-       simple_map_init(&h720x_map);
-
-       // Probe for flash bankwidth 4
-       printk (KERN_INFO "H720x-MTD probing 32bit FLASH\n");
-       mymtd = do_map_probe("cfi_probe", &h720x_map);
-       if (!mymtd) {
-               printk (KERN_INFO "H720x-MTD probing 16bit FLASH\n");
-           // Probe for bankwidth 2
-           h720x_map.bankwidth = 2;
-           mymtd = do_map_probe("cfi_probe", &h720x_map);
-       }
-
-       if (mymtd) {
-               mymtd->owner = THIS_MODULE;
-
-               mtd_device_parse_register(mymtd, NULL, NULL,
-                                         h720x_partitions, NUM_PARTITIONS);
-               return 0;
-       }
-
-       iounmap((void *)h720x_map.virt);
-       return -ENXIO;
-}
-
-/*
- * Cleanup
- */
-static void __exit h720x_mtd_cleanup(void)
-{
-
-       if (mymtd) {
-               mtd_device_unregister(mymtd);
-               map_destroy(mymtd);
-       }
-
-       if (h720x_map.virt) {
-               iounmap((void *)h720x_map.virt);
-               h720x_map.virt = 0;
-       }
-}
-
-
-module_init(h720x_mtd_init);
-module_exit(h720x_mtd_cleanup);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Thomas Gleixner <tglx@linutronix.de>");
-MODULE_DESCRIPTION("MTD map driver for Hynix evaluation boards");
index 834a06c56f565ca466276474aebb2aea360fb97b..49686744d93cac9d2dae0758fd1b70618237c9c1 100644 (file)
 #define NUM_FLASHBANKS 2
 #define BUSWIDTH     4
 
-/* can be { "cfi_probe", "jedec_probe", "map_rom", NULL } */
-#define PROBETYPES { "jedec_probe", NULL }
-
 #define MSG_PREFIX "impA7:"   /* prefix for our printk()'s */
 #define MTDID      "impa7-%d"  /* for mtdparts= partitioning */
 
 static struct mtd_info *impa7_mtd[NUM_FLASHBANKS];
 
+static const char * const rom_probe_types[] = { "jedec_probe", NULL };
 
 static struct map_info impa7_map[NUM_FLASHBANKS] = {
        {
@@ -60,8 +58,7 @@ static struct mtd_partition partitions[] =
 
 static int __init init_impa7(void)
 {
-       static const char *rom_probe_types[] = PROBETYPES;
-       const char **type;
+       const char * const *type;
        int i;
        static struct { u_long addr; u_long size; } pt[NUM_FLASHBANKS] = {
          { WINDOW_ADDR0, WINDOW_SIZE0 },
index b14053b2502618e89fc7889ff385bc68db8f074d..f581ac1cf022fb006be3b814f6f1b23341f9e851 100644 (file)
@@ -82,9 +82,9 @@ static void vr_nor_destroy_mtd_setup(struct vr_nor_mtd *p)
 
 static int vr_nor_mtd_setup(struct vr_nor_mtd *p)
 {
-       static const char *probe_types[] =
+       static const char * const probe_types[] =
            { "cfi_probe", "jedec_probe", NULL };
-       const char **type;
+       const char * const *type;
 
        for (type = probe_types; !p->info && *type; type++)
                p->info = do_map_probe(*type, &p->map);
diff --git a/drivers/mtd/maps/ixp2000.c b/drivers/mtd/maps/ixp2000.c
deleted file mode 100644 (file)
index 4a41ced..0000000
+++ /dev/null
@@ -1,253 +0,0 @@
-/*
- * drivers/mtd/maps/ixp2000.c
- *
- * Mapping for the Intel XScale IXP2000 based systems
- *
- * Copyright (C) 2002 Intel Corp.
- * Copyright (C) 2003-2004 MontaVista Software, Inc.
- *
- * Original Author: Naeem M Afzal <naeem.m.afzal@intel.com>
- * Maintainer: Deepak Saxena <dsaxena@plexity.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- */
-
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/string.h>
-#include <linux/slab.h>
-#include <linux/ioport.h>
-#include <linux/device.h>
-#include <linux/platform_device.h>
-
-#include <linux/mtd/mtd.h>
-#include <linux/mtd/map.h>
-#include <linux/mtd/partitions.h>
-
-#include <asm/io.h>
-#include <mach/hardware.h>
-#include <asm/mach/flash.h>
-
-#include <linux/reboot.h>
-
-struct ixp2000_flash_info {
-       struct          mtd_info *mtd;
-       struct          map_info map;
-       struct          resource *res;
-};
-
-static inline unsigned long flash_bank_setup(struct map_info *map, unsigned long ofs)
-{
-       unsigned long (*set_bank)(unsigned long) =
-               (unsigned long(*)(unsigned long))map->map_priv_2;
-
-       return (set_bank ? set_bank(ofs) : ofs);
-}
-
-#ifdef __ARMEB__
-/*
- * Rev A0 and A1 of IXP2400 silicon have a broken addressing unit which
- * causes the lower address bits to be XORed with 0x11 on 8 bit accesses
- * and XORed with 0x10 on 16 bit accesses. See the spec update, erratum 44.
- */
-static int erratum44_workaround = 0;
-
-static inline unsigned long address_fix8_write(unsigned long addr)
-{
-       if (erratum44_workaround) {
-               return (addr ^ 3);
-       }
-       return addr;
-}
-#else
-
-#define address_fix8_write(x)  (x)
-#endif
-
-static map_word ixp2000_flash_read8(struct map_info *map, unsigned long ofs)
-{
-       map_word val;
-
-       val.x[0] =  *((u8 *)(map->map_priv_1 + flash_bank_setup(map, ofs)));
-       return val;
-}
-
-/*
- * We can't use the standard memcpy due to the broken SlowPort
- * address translation on rev A0 and A1 silicon and the fact that
- * we have banked flash.
- */
-static void ixp2000_flash_copy_from(struct map_info *map, void *to,
-                             unsigned long from, ssize_t len)
-{
-       from = flash_bank_setup(map, from);
-       while(len--)
-               *(__u8 *) to++ = *(__u8 *)(map->map_priv_1 + from++);
-}
-
-static void ixp2000_flash_write8(struct map_info *map, map_word d, unsigned long ofs)
-{
-       *(__u8 *) (address_fix8_write(map->map_priv_1 +
-                                     flash_bank_setup(map, ofs))) = d.x[0];
-}
-
-static void ixp2000_flash_copy_to(struct map_info *map, unsigned long to,
-                           const void *from, ssize_t len)
-{
-       to = flash_bank_setup(map, to);
-       while(len--) {
-               unsigned long tmp = address_fix8_write(map->map_priv_1 + to++);
-               *(__u8 *)(tmp) = *(__u8 *)(from++);
-       }
-}
-
-
-static int ixp2000_flash_remove(struct platform_device *dev)
-{
-       struct flash_platform_data *plat = dev->dev.platform_data;
-       struct ixp2000_flash_info *info = platform_get_drvdata(dev);
-
-       platform_set_drvdata(dev, NULL);
-
-       if(!info)
-               return 0;
-
-       if (info->mtd) {
-               mtd_device_unregister(info->mtd);
-               map_destroy(info->mtd);
-       }
-       if (info->map.map_priv_1)
-               iounmap((void *) info->map.map_priv_1);
-
-       if (info->res) {
-               release_resource(info->res);
-               kfree(info->res);
-       }
-
-       if (plat->exit)
-               plat->exit();
-
-       return 0;
-}
-
-
-static int ixp2000_flash_probe(struct platform_device *dev)
-{
-       static const char *probes[] = { "RedBoot", "cmdlinepart", NULL };
-       struct ixp2000_flash_data *ixp_data = dev->dev.platform_data;
-       struct flash_platform_data *plat;
-       struct ixp2000_flash_info *info;
-       unsigned long window_size;
-       int err = -1;
-
-       if (!ixp_data)
-               return -ENODEV;
-
-       plat = ixp_data->platform_data;
-       if (!plat)
-               return -ENODEV;
-
-       window_size = resource_size(dev->resource);
-       dev_info(&dev->dev, "Probe of IXP2000 flash(%d banks x %dMiB)\n",
-                ixp_data->nr_banks, ((u32)window_size >> 20));
-
-       if (plat->width != 1) {
-               dev_err(&dev->dev, "IXP2000 MTD map only supports 8-bit mode, asking for %d\n",
-                       plat->width * 8);
-               return -EIO;
-       }
-
-       info = kzalloc(sizeof(struct ixp2000_flash_info), GFP_KERNEL);
-       if(!info) {
-               err = -ENOMEM;
-               goto Error;
-       }
-
-       platform_set_drvdata(dev, info);
-
-       /*
-        * Tell the MTD layer we're not 1:1 mapped so that it does
-        * not attempt to do a direct access on us.
-        */
-       info->map.phys = NO_XIP;
-
-       info->map.size = ixp_data->nr_banks * window_size;
-       info->map.bankwidth = 1;
-
-       /*
-        * map_priv_2 is used to store a ptr to the bank_setup routine
-        */
-       info->map.map_priv_2 = (unsigned long) ixp_data->bank_setup;
-
-       info->map.name = dev_name(&dev->dev);
-       info->map.read = ixp2000_flash_read8;
-       info->map.write = ixp2000_flash_write8;
-       info->map.copy_from = ixp2000_flash_copy_from;
-       info->map.copy_to = ixp2000_flash_copy_to;
-
-       info->res = request_mem_region(dev->resource->start,
-                                      resource_size(dev->resource),
-                                      dev_name(&dev->dev));
-       if (!info->res) {
-               dev_err(&dev->dev, "Could not reserve memory region\n");
-               err = -ENOMEM;
-               goto Error;
-       }
-
-       info->map.map_priv_1 =
-               (unsigned long)ioremap(dev->resource->start,
-                                      resource_size(dev->resource));
-       if (!info->map.map_priv_1) {
-               dev_err(&dev->dev, "Failed to ioremap flash region\n");
-               err = -EIO;
-               goto Error;
-       }
-
-#if defined(__ARMEB__)
-       /*
-        * Enable erratum 44 workaround for NPUs with broken slowport
-        */
-
-       erratum44_workaround = ixp2000_has_broken_slowport();
-       dev_info(&dev->dev, "Erratum 44 workaround %s\n",
-              erratum44_workaround ? "enabled" : "disabled");
-#endif
-
-       info->mtd = do_map_probe(plat->map_name, &info->map);
-       if (!info->mtd) {
-               dev_err(&dev->dev, "map_probe failed\n");
-               err = -ENXIO;
-               goto Error;
-       }
-       info->mtd->owner = THIS_MODULE;
-
-       err = mtd_device_parse_register(info->mtd, probes, NULL, NULL, 0);
-       if (err)
-               goto Error;
-
-       return 0;
-
-Error:
-       ixp2000_flash_remove(dev);
-       return err;
-}
-
-static struct platform_driver ixp2000_flash_driver = {
-       .probe          = ixp2000_flash_probe,
-       .remove         = ixp2000_flash_remove,
-       .driver         = {
-               .name   = "IXP2000-Flash",
-               .owner  = THIS_MODULE,
-       },
-};
-
-module_platform_driver(ixp2000_flash_driver);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Deepak Saxena <dsaxena@plexity.net>");
-MODULE_ALIAS("platform:IXP2000-Flash");
index e864fc6c58f9ccd900aad7284570ef70ab919220..52b3410a105c943d47ce3b9ea873781948d028a4 100644 (file)
@@ -148,7 +148,7 @@ struct ixp4xx_flash_info {
        struct resource *res;
 };
 
-static const char *probes[] = { "RedBoot", "cmdlinepart", NULL };
+static const char * const probes[] = { "RedBoot", "cmdlinepart", NULL };
 
 static int ixp4xx_flash_remove(struct platform_device *dev)
 {
index d1da6ede3845bfdac001dffb036422d1107bdcd9..d7ac65d1d569dee08e325480501e29cc550ef012 100644 (file)
@@ -46,8 +46,7 @@ struct ltq_mtd {
 };
 
 static const char ltq_map_name[] = "ltq_nor";
-static const char *ltq_probe_types[] = {
-                                       "cmdlinepart", "ofpart", NULL };
+static const char * const ltq_probe_types[] = { "cmdlinepart", "ofpart", NULL };
 
 static map_word
 ltq_read16(struct map_info *map, unsigned long adr)
diff --git a/drivers/mtd/maps/mbx860.c b/drivers/mtd/maps/mbx860.c
deleted file mode 100644 (file)
index 93fa56c..0000000
+++ /dev/null
@@ -1,98 +0,0 @@
-/*
- * Handle mapping of the flash on MBX860 boards
- *
- * Author:     Anton Todorov
- * Copyright:  (C) 2001 Emness Technology
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- */
-
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <asm/io.h>
-#include <linux/mtd/mtd.h>
-#include <linux/mtd/map.h>
-#include <linux/mtd/partitions.h>
-
-
-#define WINDOW_ADDR 0xfe000000
-#define WINDOW_SIZE 0x00200000
-
-/* Flash / Partition sizing */
-#define MAX_SIZE_KiB              8192
-#define BOOT_PARTITION_SIZE_KiB    512
-#define KERNEL_PARTITION_SIZE_KiB 5632
-#define APP_PARTITION_SIZE_KiB    2048
-
-#define NUM_PARTITIONS 3
-
-/* partition_info gives details on the logical partitions that split the
- * single flash device into. If the size is zero we use up to the end of the
- * device. */
-static struct mtd_partition partition_info[]={
-       { .name = "MBX flash BOOT partition",
-       .offset = 0,
-       .size =   BOOT_PARTITION_SIZE_KiB*1024 },
-       { .name = "MBX flash DATA partition",
-       .offset = BOOT_PARTITION_SIZE_KiB*1024,
-       .size = (KERNEL_PARTITION_SIZE_KiB)*1024 },
-       { .name = "MBX flash APPLICATION partition",
-       .offset = (BOOT_PARTITION_SIZE_KiB+KERNEL_PARTITION_SIZE_KiB)*1024 }
-};
-
-
-static struct mtd_info *mymtd;
-
-struct map_info mbx_map = {
-       .name = "MBX flash",
-       .size = WINDOW_SIZE,
-       .phys = WINDOW_ADDR,
-       .bankwidth = 4,
-};
-
-static int __init init_mbx(void)
-{
-       printk(KERN_NOTICE "Motorola MBX flash device: 0x%x at 0x%x\n", WINDOW_SIZE*4, WINDOW_ADDR);
-       mbx_map.virt = ioremap(WINDOW_ADDR, WINDOW_SIZE * 4);
-
-       if (!mbx_map.virt) {
-               printk("Failed to ioremap\n");
-               return -EIO;
-       }
-       simple_map_init(&mbx_map);
-
-       mymtd = do_map_probe("jedec_probe", &mbx_map);
-       if (mymtd) {
-               mymtd->owner = THIS_MODULE;
-               mtd_device_register(mymtd, NULL, 0);
-               mtd_device_register(mymtd, partition_info, NUM_PARTITIONS);
-               return 0;
-       }
-
-       iounmap((void *)mbx_map.virt);
-       return -ENXIO;
-}
-
-static void __exit cleanup_mbx(void)
-{
-       if (mymtd) {
-               mtd_device_unregister(mymtd);
-               map_destroy(mymtd);
-       }
-       if (mbx_map.virt) {
-               iounmap((void *)mbx_map.virt);
-               mbx_map.virt = 0;
-       }
-}
-
-module_init(init_mbx);
-module_exit(cleanup_mbx);
-
-MODULE_AUTHOR("Anton Todorov <a.todorov@emness.com>");
-MODULE_DESCRIPTION("MTD map driver for Motorola MBX860 board");
-MODULE_LICENSE("GPL");
index c3aebd5da5d61046c9f9047148d3d7347a00ec2a..c2604f8b2a5efccd1cde4f77db2be09eb9ce1624 100644 (file)
@@ -283,8 +283,7 @@ static int mtd_pci_probe(struct pci_dev *dev, const struct pci_device_id *id)
        if (err)
                goto release;
 
-       /* tsk - do_map_probe should take const char * */
-       mtd = do_map_probe((char *)info->map_name, &map->map);
+       mtd = do_map_probe(info->map_name, &map->map);
        err = -ENODEV;
        if (!mtd)
                goto release;
index 21b0b713cacb8fd8cb1870b6341ac69c08354d19..e7a592c8c76591e02257e5c39b891aefbf43d9d3 100644 (file)
@@ -87,21 +87,18 @@ static void physmap_set_vpp(struct map_info *map, int state)
        spin_unlock_irqrestore(&info->vpp_lock, flags);
 }
 
-static const char *rom_probe_types[] = {
-                                       "cfi_probe",
-                                       "jedec_probe",
-                                       "qinfo_probe",
-                                       "map_rom",
-                                       NULL };
-static const char *part_probe_types[] = { "cmdlinepart", "RedBoot", "afs",
-                                         NULL };
+static const char * const rom_probe_types[] = {
+       "cfi_probe", "jedec_probe", "qinfo_probe", "map_rom", NULL };
+
+static const char * const part_probe_types[] = {
+       "cmdlinepart", "RedBoot", "afs", NULL };
 
 static int physmap_flash_probe(struct platform_device *dev)
 {
        struct physmap_flash_data *physmap_data;
        struct physmap_flash_info *info;
-       const char **probe_type;
-       const char **part_types;
+       const char * const *probe_type;
+       const char * const *part_types;
        int err = 0;
        int i;
        int devices_found = 0;
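The same constification pattern recurs throughout these map-driver hunks: the probe-name tables become "const char * const", so both the string contents and the pointer slots are read-only (letting the whole table live in rodata), which in turn requires the "const char * const *" iterator type seen above. A stand-alone sketch of the distinction, illustrative only:

#include <stdio.h>

/* Array of pointers to const char: the strings are read-only,
 * but the pointer slots themselves could still be reassigned. */
static const char *old_probes[] = { "cmdlinepart", "RedBoot", NULL };

/* Array of const pointers to const char: nothing in the table is writable. */
static const char * const new_probes[] = { "cmdlinepart", "RedBoot", NULL };

int main(void)
{
	const char * const *type;	/* matching iterator type */

	for (type = new_probes; *type; type++)
		puts(*type);

	(void)old_probes;
	return 0;
}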
index 363939dfad05afe9eb24320e31130d99b741080f..d11109762ac5ca311a1f9bb90711bca0855ebc83 100644 (file)
@@ -71,6 +71,9 @@ static int of_flash_remove(struct platform_device *dev)
        return 0;
 }
 
+static const char * const rom_probe_types[] = {
+       "cfi_probe", "jedec_probe", "map_rom" };
+
 /* Helper function to handle probing of the obsolete "direct-mapped"
  * compatible binding, which has an extra "probe-type" property
  * describing the type of flash probe necessary. */
@@ -80,8 +83,6 @@ static struct mtd_info *obsolete_probe(struct platform_device *dev,
        struct device_node *dp = dev->dev.of_node;
        const char *of_probe;
        struct mtd_info *mtd;
-       static const char *rom_probe_types[]
-               = { "cfi_probe", "jedec_probe", "map_rom"};
        int i;
 
        dev_warn(&dev->dev, "Device tree uses obsolete \"direct-mapped\" "
@@ -111,9 +112,10 @@ static struct mtd_info *obsolete_probe(struct platform_device *dev,
    specifies the list of partition probers to use. If none is given then the
    default is used. These take precedence over other device tree
    information. */
-static const char *part_probe_types_def[] = { "cmdlinepart", "RedBoot",
-                                       "ofpart", "ofoldpart", NULL };
-static const char **of_get_probes(struct device_node *dp)
+static const char * const part_probe_types_def[] = {
+       "cmdlinepart", "RedBoot", "ofpart", "ofoldpart", NULL };
+
+static const char * const *of_get_probes(struct device_node *dp)
 {
        const char *cp;
        int cplen;
@@ -142,7 +144,7 @@ static const char **of_get_probes(struct device_node *dp)
        return res;
 }
 
-static void of_free_probes(const char **probes)
+static void of_free_probes(const char * const *probes)
 {
        if (probes != part_probe_types_def)
                kfree(probes);
@@ -151,7 +153,7 @@ static void of_free_probes(const char **probes)
 static struct of_device_id of_flash_match[];
 static int of_flash_probe(struct platform_device *dev)
 {
-       const char **part_probe_types;
+       const char * const *part_probe_types;
        const struct of_device_id *match;
        struct device_node *dp = dev->dev.of_node;
        struct resource res;
index 2de66b062f0d733be6e97751312d9f576c198236..71fdda29594b7c3595d786c1572012d09b4df8af 100644 (file)
@@ -199,7 +199,7 @@ static int platram_probe(struct platform_device *pdev)
         * supplied by the platform_data struct */
 
        if (pdata->map_probes) {
-               const char **map_probes = pdata->map_probes;
+               const char * const *map_probes = pdata->map_probes;
 
                for ( ; !info->mtd && *map_probes; map_probes++)
                        info->mtd = do_map_probe(*map_probes , &info->map);
index 43e3dbb976d9937e0abf11ad8bab3b08da7f2a26..acb1dbcf7ce58a438ed7ff9a1df9aecb2b57478f 100644 (file)
@@ -45,9 +45,7 @@ struct pxa2xx_flash_info {
        struct map_info         map;
 };
 
-
-static const char *probes[] = { "RedBoot", "cmdlinepart", NULL };
-
+static const char * const probes[] = { "RedBoot", "cmdlinepart", NULL };
 
 static int pxa2xx_flash_probe(struct platform_device *pdev)
 {
index 49c3fe715eee155cfc7519f5327e5f03640bd3be..ac02fbffd6df940f70e3802895811f5916843541 100644 (file)
@@ -45,14 +45,15 @@ static int rbtx4939_flash_remove(struct platform_device *dev)
        return 0;
 }
 
-static const char *rom_probe_types[] = { "cfi_probe", "jedec_probe", NULL };
+static const char * const rom_probe_types[] = {
+       "cfi_probe", "jedec_probe", NULL };
 
 static int rbtx4939_flash_probe(struct platform_device *dev)
 {
        struct rbtx4939_flash_data *pdata;
        struct rbtx4939_flash_info *info;
        struct resource *res;
-       const char **probe_type;
+       const char * const *probe_type;
        int err = 0;
        unsigned long size;
 
diff --git a/drivers/mtd/maps/rpxlite.c b/drivers/mtd/maps/rpxlite.c
deleted file mode 100644 (file)
index ed88225..0000000
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * Handle mapping of the flash on the RPX Lite and CLLF boards
- */
-
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <asm/io.h>
-#include <linux/mtd/mtd.h>
-#include <linux/mtd/map.h>
-
-
-#define WINDOW_ADDR 0xfe000000
-#define WINDOW_SIZE 0x800000
-
-static struct mtd_info *mymtd;
-
-static struct map_info rpxlite_map = {
-       .name = "RPX",
-       .size = WINDOW_SIZE,
-       .bankwidth = 4,
-       .phys = WINDOW_ADDR,
-};
-
-static int __init init_rpxlite(void)
-{
-       printk(KERN_NOTICE "RPX Lite or CLLF flash device: %x at %x\n", WINDOW_SIZE*4, WINDOW_ADDR);
-       rpxlite_map.virt = ioremap(WINDOW_ADDR, WINDOW_SIZE * 4);
-
-       if (!rpxlite_map.virt) {
-               printk("Failed to ioremap\n");
-               return -EIO;
-       }
-       simple_map_init(&rpxlite_map);
-       mymtd = do_map_probe("cfi_probe", &rpxlite_map);
-       if (mymtd) {
-               mymtd->owner = THIS_MODULE;
-               mtd_device_register(mymtd, NULL, 0);
-               return 0;
-       }
-
-       iounmap((void *)rpxlite_map.virt);
-       return -ENXIO;
-}
-
-static void __exit cleanup_rpxlite(void)
-{
-       if (mymtd) {
-               mtd_device_unregister(mymtd);
-               map_destroy(mymtd);
-       }
-       if (rpxlite_map.virt) {
-               iounmap((void *)rpxlite_map.virt);
-               rpxlite_map.virt = 0;
-       }
-}
-
-module_init(init_rpxlite);
-module_exit(cleanup_rpxlite);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Arnold Christensen <AKC@pel.dk>");
-MODULE_DESCRIPTION("MTD map driver for RPX Lite and CLLF boards");
index f694417cf7e694bc3e7f04edef6484ef03449e13..29e3dcaa1d90413b15df9e30c04373d897c2f6c6 100644 (file)
@@ -244,7 +244,7 @@ static struct sa_info *sa1100_setup_mtd(struct platform_device *pdev,
        return ERR_PTR(ret);
 }
 
-static const char *part_probes[] = { "cmdlinepart", "RedBoot", NULL };
+static const char * const part_probes[] = { "cmdlinepart", "RedBoot", NULL };
 
 static int sa1100_mtd_probe(struct platform_device *pdev)
 {
index 9d900ada67084535daa04aa5dd1effb7d9053857..83a7a70915624fbd5ba05a1eaca00a6cf2016d38 100644 (file)
@@ -31,7 +31,7 @@ struct map_info soleng_flash_map = {
        .bankwidth = 4,
 };
 
-static const char *probes[] = { "RedBoot", "cmdlinepart", NULL };
+static const char * const probes[] = { "RedBoot", "cmdlinepart", NULL };
 
 #ifdef CONFIG_MTD_SUPERH_RESERVE
 static struct mtd_partition superh_se_partitions[] = {
diff --git a/drivers/mtd/maps/tqm8xxl.c b/drivers/mtd/maps/tqm8xxl.c
deleted file mode 100644 (file)
index d785879..0000000
+++ /dev/null
@@ -1,249 +0,0 @@
-/*
- * Handle mapping of the flash memory access routines
- * on TQM8xxL based devices.
- *
- * based on rpxlite.c
- *
- * Copyright(C) 2001 Kirk Lee <kirk@hpc.ee.ntu.edu.tw>
- *
- * This code is GPLed
- *
- */
-
-/*
- * According to TQM8xxL hardware manual, TQM8xxL series have
- * following flash memory organisations:
- *     | capacity |    | chip type |   | bank0 |       | bank1 |
- *         2MiB           512Kx16        2MiB             0
- *         4MiB           1Mx16          4MiB             0
- *         8MiB           1Mx16          4MiB             4MiB
- * Thus, we choose CONFIG_MTD_CFI_I2 & CONFIG_MTD_CFI_B4 at
- * kernel configuration.
- */
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/slab.h>
-
-#include <linux/mtd/mtd.h>
-#include <linux/mtd/map.h>
-#include <linux/mtd/partitions.h>
-
-#include <asm/io.h>
-
-#define FLASH_ADDR 0x40000000
-#define FLASH_SIZE 0x00800000
-#define FLASH_BANK_MAX 4
-
-// trivial struct to describe partition information
-struct mtd_part_def
-{
-       int nums;
-       unsigned char *type;
-       struct mtd_partition* mtd_part;
-};
-
-//static struct mtd_info *mymtd;
-static struct mtd_info* mtd_banks[FLASH_BANK_MAX];
-static struct map_info* map_banks[FLASH_BANK_MAX];
-static struct mtd_part_def part_banks[FLASH_BANK_MAX];
-static unsigned long num_banks;
-static void __iomem *start_scan_addr;
-
-/*
- * Here are partition information for all known TQM8xxL series devices.
- * See include/linux/mtd/partitions.h for definition of the mtd_partition
- * structure.
- *
- * The *_max_flash_size is the maximum possible mapped flash size which
- * is not necessarily the actual flash size.  It must correspond to the
- * value specified in the mapping definition defined by the
- * "struct map_desc *_io_desc" for the corresponding machine.
- */
-
-/* Currently, TQM8xxL has up to 8MiB flash */
-static unsigned long tqm8xxl_max_flash_size = 0x00800000;
-
-/* partition definition for first flash bank
- * (cf. "drivers/char/flash_config.c")
- */
-static struct mtd_partition tqm8xxl_partitions[] = {
-       {
-         .name = "ppcboot",
-         .offset = 0x00000000,
-         .size = 0x00020000,           /* 128KB           */
-         .mask_flags = MTD_WRITEABLE,  /* force read-only */
-       },
-       {
-         .name = "kernel",             /* default kernel image */
-         .offset = 0x00020000,
-         .size = 0x000e0000,
-         .mask_flags = MTD_WRITEABLE,  /* force read-only */
-       },
-       {
-         .name = "user",
-         .offset = 0x00100000,
-         .size = 0x00100000,
-       },
-       {
-         .name = "initrd",
-         .offset = 0x00200000,
-         .size = 0x00200000,
-       }
-};
-/* partition definition for second flash bank */
-static struct mtd_partition tqm8xxl_fs_partitions[] = {
-       {
-         .name = "cramfs",
-         .offset = 0x00000000,
-         .size = 0x00200000,
-       },
-       {
-         .name = "jffs",
-         .offset = 0x00200000,
-         .size = 0x00200000,
-         //.size = MTDPART_SIZ_FULL,
-       }
-};
-
-static int __init init_tqm_mtd(void)
-{
-       int idx = 0, ret = 0;
-       unsigned long flash_addr, flash_size, mtd_size = 0;
-       /* pointer to TQM8xxL board info data */
-       bd_t *bd = (bd_t *)__res;
-
-       flash_addr = bd->bi_flashstart;
-       flash_size = bd->bi_flashsize;
-
-       //request maximum flash size address space
-       start_scan_addr = ioremap(flash_addr, flash_size);
-       if (!start_scan_addr) {
-               printk(KERN_WARNING "%s:Failed to ioremap address:0x%x\n", __func__, flash_addr);
-               return -EIO;
-       }
-
-       for (idx = 0 ; idx < FLASH_BANK_MAX ; idx++) {
-               if(mtd_size >= flash_size)
-                       break;
-
-               printk(KERN_INFO "%s: chip probing count %d\n", __func__, idx);
-
-               map_banks[idx] = kzalloc(sizeof(struct map_info), GFP_KERNEL);
-               if(map_banks[idx] == NULL) {
-                       ret = -ENOMEM;
-                       /* FIXME: What if some MTD devices were probed already? */
-                       goto error_mem;
-               }
-
-               map_banks[idx]->name = kmalloc(16, GFP_KERNEL);
-
-               if (!map_banks[idx]->name) {
-                       ret = -ENOMEM;
-                       /* FIXME: What if some MTD devices were probed already? */
-                       goto error_mem;
-               }
-               sprintf(map_banks[idx]->name, "TQM8xxL%d", idx);
-
-               map_banks[idx]->size = flash_size;
-               map_banks[idx]->bankwidth = 4;
-
-               simple_map_init(map_banks[idx]);
-
-               map_banks[idx]->virt = start_scan_addr;
-               map_banks[idx]->phys = flash_addr;
-               /* FIXME: This looks utterly bogus, but I'm trying to
-                  preserve the behaviour of the original (shown here)...
-
-               map_banks[idx]->map_priv_1 =
-               start_scan_addr + ((idx > 0) ?
-               (mtd_banks[idx-1] ? mtd_banks[idx-1]->size : 0) : 0);
-               */
-
-               if (idx && mtd_banks[idx-1]) {
-                       map_banks[idx]->virt += mtd_banks[idx-1]->size;
-                       map_banks[idx]->phys += mtd_banks[idx-1]->size;
-               }
-
-               //start to probe flash chips
-               mtd_banks[idx] = do_map_probe("cfi_probe", map_banks[idx]);
-
-               if (mtd_banks[idx]) {
-                       mtd_banks[idx]->owner = THIS_MODULE;
-                       mtd_size += mtd_banks[idx]->size;
-                       num_banks++;
-
-                       printk(KERN_INFO "%s: bank%d, name:%s, size:%dbytes \n", __func__, num_banks,
-                       mtd_banks[idx]->name, mtd_banks[idx]->size);
-               }
-       }
-
-       /* no supported flash chips found */
-       if (!num_banks) {
-               printk(KERN_NOTICE "TQM8xxL: No support flash chips found!\n");
-               ret = -ENXIO;
-               goto error_mem;
-       }
-
-       /*
-        * Select Static partition definitions
-        */
-       part_banks[0].mtd_part = tqm8xxl_partitions;
-       part_banks[0].type = "Static image";
-       part_banks[0].nums = ARRAY_SIZE(tqm8xxl_partitions);
-
-       part_banks[1].mtd_part = tqm8xxl_fs_partitions;
-       part_banks[1].type = "Static file system";
-       part_banks[1].nums = ARRAY_SIZE(tqm8xxl_fs_partitions);
-
-       for(idx = 0; idx < num_banks ; idx++) {
-               if (part_banks[idx].nums == 0)
-                       printk(KERN_NOTICE "TQM flash%d: no partition info available, registering whole flash at once\n", idx);
-               else
-                       printk(KERN_NOTICE "TQM flash%d: Using %s partition definition\n",
-                                       idx, part_banks[idx].type);
-               mtd_device_register(mtd_banks[idx], part_banks[idx].mtd_part,
-               part_banks[idx].nums);
-       }
-       return 0;
-error_mem:
-       for(idx = 0 ; idx < FLASH_BANK_MAX ; idx++) {
-               if(map_banks[idx] != NULL) {
-                       kfree(map_banks[idx]->name);
-                       map_banks[idx]->name = NULL;
-                       kfree(map_banks[idx]);
-                       map_banks[idx] = NULL;
-               }
-       }
-error:
-       iounmap(start_scan_addr);
-       return ret;
-}
-
-static void __exit cleanup_tqm_mtd(void)
-{
-       unsigned int idx = 0;
-       for(idx = 0 ; idx < num_banks ; idx++) {
-               /* destroy mtd_info previously allocated */
-               if (mtd_banks[idx]) {
-                       mtd_device_unregister(mtd_banks[idx]);
-                       map_destroy(mtd_banks[idx]);
-               }
-               /* release map_info not used anymore */
-               kfree(map_banks[idx]->name);
-               kfree(map_banks[idx]);
-       }
-
-       if (start_scan_addr) {
-               iounmap(start_scan_addr);
-               start_scan_addr = 0;
-       }
-}
-
-module_init(init_tqm_mtd);
-module_exit(cleanup_tqm_mtd);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Kirk Lee <kirk@hpc.ee.ntu.edu.tw>");
-MODULE_DESCRIPTION("MTD map driver for TQM8xxL boards");
index 1de390e1c2fb3039b98100014e21ac2e6563ba4c..da2cdb5fd6db98862dba7a0fe7c84ca6abd563af 100644 (file)
@@ -82,11 +82,12 @@ static void __exit  cleanup_tsunami_flash(void)
        tsunami_flash_mtd = 0;
 }
 
+static const char * const rom_probe_types[] = {
+       "cfi_probe", "jedec_probe", "map_rom", NULL };
 
 static int __init init_tsunami_flash(void)
 {
-       static const char *rom_probe_types[] = { "cfi_probe", "jedec_probe", "map_rom", NULL };
-       char **type;
+       const char * const *type;
 
        tsunami_tig_writeb(FLASH_ENABLE_BYTE, FLASH_ENABLE_PORT);
 
index 5ad39bb5ab4c02659cacb1fdb82e8a5b30188759..5073cbc796d86c3a04c4f22d83b01c2f01b1d114 100644 (file)
@@ -237,13 +237,12 @@ error_put:
        return ret;
 }
 
-static int blktrans_release(struct gendisk *disk, fmode_t mode)
+static void blktrans_release(struct gendisk *disk, fmode_t mode)
 {
        struct mtd_blktrans_dev *dev = blktrans_dev_get(disk);
-       int ret = 0;
 
        if (!dev)
-               return ret;
+               return;
 
        mutex_lock(&dev->lock);
 
@@ -254,13 +253,13 @@ static int blktrans_release(struct gendisk *disk, fmode_t mode)
        module_put(dev->tr->owner);
 
        if (dev->mtd) {
-               ret = dev->tr->release ? dev->tr->release(dev) : 0;
+               if (dev->tr->release)
+                       dev->tr->release(dev);
                __put_mtd_device(dev->mtd);
        }
 unlock:
        mutex_unlock(&dev->lock);
        blktrans_dev_put(dev);
-       return ret;
 }
 
 static int blktrans_getgeo(struct block_device *bdev, struct hd_geometry *geo)
index 6c6d80736fadfb4013871b31795e17354a4d671e..2aef5dda522be57cbd6652963d4078966f7bc64e 100644 (file)
@@ -308,7 +308,7 @@ static int mtdblock_open(struct mtd_blktrans_dev *mbd)
        return 0;
 }
 
-static int mtdblock_release(struct mtd_blktrans_dev *mbd)
+static void mtdblock_release(struct mtd_blktrans_dev *mbd)
 {
        struct mtdblk_dev *mtdblk = container_of(mbd, struct mtdblk_dev, mbd);
 
@@ -333,8 +333,6 @@ static int mtdblock_release(struct mtd_blktrans_dev *mbd)
        mutex_unlock(&mtdblks_lock);
 
        pr_debug("ok\n");
-
-       return 0;
 }
 
 static int mtdblock_flush(struct mtd_blktrans_dev *dev)
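Note: the release conversion above follows the block-layer change in this cycle; mtd_blktrans_ops::release now returns void, so a translation layer can no longer report a failure on close. A minimal sketch of the resulting callback shape (driver names here are hypothetical, not from this patch):

	#include <linux/mtd/blktrans.h>

	/* Hypothetical translation layer: release can no longer report failure. */
	static void example_tr_release(struct mtd_blktrans_dev *dev)
	{
		/* flush caches and drop per-device state; nothing to return */
	}

	static struct mtd_blktrans_ops example_tr = {
		.name    = "example",
		.release = example_tr_release,
	};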
index dc571ebc1aa0be8680f4a20397653e9d4a7f84cf..c719879284bd7b9921a254a70831a39bae644a7d 100644 (file)
@@ -38,6 +38,8 @@
 
 #include <asm/uaccess.h>
 
+#include "mtdcore.h"
+
 static DEFINE_MUTEX(mtd_mutex);
 
 /*
@@ -365,37 +367,35 @@ static void mtdchar_erase_callback (struct erase_info *instr)
        wake_up((wait_queue_head_t *)instr->priv);
 }
 
-#ifdef CONFIG_HAVE_MTD_OTP
 static int otp_select_filemode(struct mtd_file_info *mfi, int mode)
 {
        struct mtd_info *mtd = mfi->mtd;
        size_t retlen;
-       int ret = 0;
-
-       /*
-        * Make a fake call to mtd_read_fact_prot_reg() to check if OTP
-        * operations are supported.
-        */
-       if (mtd_read_fact_prot_reg(mtd, -1, 0, &retlen, NULL) == -EOPNOTSUPP)
-               return -EOPNOTSUPP;
 
        switch (mode) {
        case MTD_OTP_FACTORY:
+               if (mtd_read_fact_prot_reg(mtd, -1, 0, &retlen, NULL) ==
+                               -EOPNOTSUPP)
+                       return -EOPNOTSUPP;
+
                mfi->mode = MTD_FILE_MODE_OTP_FACTORY;
                break;
        case MTD_OTP_USER:
+               if (mtd_read_user_prot_reg(mtd, -1, 0, &retlen, NULL) ==
+                               -EOPNOTSUPP)
+                       return -EOPNOTSUPP;
+
                mfi->mode = MTD_FILE_MODE_OTP_USER;
                break;
-       default:
-               ret = -EINVAL;
        case MTD_OTP_OFF:
+               mfi->mode = MTD_FILE_MODE_NORMAL;
                break;
+       default:
+               return -EINVAL;
        }
-       return ret;
+
+       return 0;
 }
-#else
-# define otp_select_filemode(f,m)      -EOPNOTSUPP
-#endif
 
 static int mtdchar_writeoob(struct file *file, struct mtd_info *mtd,
        uint64_t start, uint32_t length, void __user *ptr,
@@ -888,7 +888,6 @@ static int mtdchar_ioctl(struct file *file, u_int cmd, u_long arg)
                break;
        }
 
-#ifdef CONFIG_HAVE_MTD_OTP
        case OTPSELECT:
        {
                int mode;
@@ -944,7 +943,6 @@ static int mtdchar_ioctl(struct file *file, u_int cmd, u_long arg)
                ret = mtd_lock_user_prot_reg(mtd, oinfo.start, oinfo.length);
                break;
        }
-#endif
 
        /* This ioctl is being deprecated - it truncates the ECC layout */
        case ECCGETLAYOUT:
@@ -1185,23 +1183,25 @@ static struct file_system_type mtd_inodefs_type = {
 };
 MODULE_ALIAS_FS("mtd_inodefs");
 
-static int __init init_mtdchar(void)
+int __init init_mtdchar(void)
 {
        int ret;
 
        ret = __register_chrdev(MTD_CHAR_MAJOR, 0, 1 << MINORBITS,
                                   "mtd", &mtd_fops);
        if (ret < 0) {
-               pr_notice("Can't allocate major number %d for "
-                               "Memory Technology Devices.\n", MTD_CHAR_MAJOR);
+               pr_err("Can't allocate major number %d for MTD\n",
+                      MTD_CHAR_MAJOR);
                return ret;
        }
 
        ret = register_filesystem(&mtd_inodefs_type);
        if (ret) {
-               pr_notice("Can't register mtd_inodefs filesystem: %d\n", ret);
+               pr_err("Can't register mtd_inodefs filesystem, error %d\n",
+                      ret);
                goto err_unregister_chdev;
        }
+
        return ret;
 
 err_unregister_chdev:
@@ -1209,18 +1209,10 @@ err_unregister_chdev:
        return ret;
 }
 
-static void __exit cleanup_mtdchar(void)
+void __exit cleanup_mtdchar(void)
 {
        unregister_filesystem(&mtd_inodefs_type);
        __unregister_chrdev(MTD_CHAR_MAJOR, 0, 1 << MINORBITS, "mtd");
 }
 
-module_init(init_mtdchar);
-module_exit(cleanup_mtdchar);
-
-MODULE_ALIAS_CHARDEV_MAJOR(MTD_CHAR_MAJOR);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("David Woodhouse <dwmw2@infradead.org>");
-MODULE_DESCRIPTION("Direct character-device access to MTD devices");
 MODULE_ALIAS_CHARDEV_MAJOR(MTD_CHAR_MAJOR);
index 322ca65b0cc59fe5fd0dcf4c13ce71f9f91e2753..c400c57c394aaddb26147bfe2b1955c6716f74d5 100644 (file)
@@ -42,6 +42,7 @@
 #include <linux/mtd/partitions.h>
 
 #include "mtdcore.h"
+
 /*
  * backing device capabilities for non-mappable devices (such as NAND flash)
  * - permits private mappings, copies are taken of the data
@@ -97,11 +98,7 @@ EXPORT_SYMBOL_GPL(__mtd_next_device);
 static LIST_HEAD(mtd_notifiers);
 
 
-#if defined(CONFIG_MTD_CHAR) || defined(CONFIG_MTD_CHAR_MODULE)
 #define MTD_DEVT(index) MKDEV(MTD_CHAR_MAJOR, (index)*2)
-#else
-#define MTD_DEVT(index) 0
-#endif
 
 /* REVISIT once MTD uses the driver model better, whoever allocates
  * the mtd_info will probably want to use the release() hook...
@@ -493,7 +490,7 @@ out_error:
  *
  * Returns zero in case of success and a negative error code in case of failure.
  */
-int mtd_device_parse_register(struct mtd_info *mtd, const char **types,
+int mtd_device_parse_register(struct mtd_info *mtd, const char * const *types,
                              struct mtd_part_parser_data *parser_data,
                              const struct mtd_partition *parts,
                              int nr_parts)
@@ -1117,8 +1114,6 @@ EXPORT_SYMBOL_GPL(mtd_kmalloc_up_to);
 /*====================================================================*/
 /* Support for /proc/mtd */
 
-static struct proc_dir_entry *proc_mtd;
-
 static int mtd_proc_show(struct seq_file *m, void *v)
 {
        struct mtd_info *mtd;
@@ -1164,6 +1159,8 @@ static int __init mtd_bdi_init(struct backing_dev_info *bdi, const char *name)
        return ret;
 }
 
+static struct proc_dir_entry *proc_mtd;
+
 static int __init init_mtd(void)
 {
        int ret;
@@ -1184,11 +1181,17 @@ static int __init init_mtd(void)
        if (ret)
                goto err_bdi3;
 
-#ifdef CONFIG_PROC_FS
        proc_mtd = proc_create("mtd", 0, NULL, &mtd_proc_ops);
-#endif /* CONFIG_PROC_FS */
+
+       ret = init_mtdchar();
+       if (ret)
+               goto out_procfs;
+
        return 0;
 
+out_procfs:
+       if (proc_mtd)
+               remove_proc_entry("mtd", NULL);
 err_bdi3:
        bdi_destroy(&mtd_bdi_ro_mappable);
 err_bdi2:
@@ -1202,10 +1205,9 @@ err_reg:
 
 static void __exit cleanup_mtd(void)
 {
-#ifdef CONFIG_PROC_FS
+       cleanup_mtdchar();
        if (proc_mtd)
-               remove_proc_entry( "mtd", NULL);
-#endif /* CONFIG_PROC_FS */
+               remove_proc_entry("mtd", NULL);
        class_unregister(&mtd_class);
        bdi_destroy(&mtd_bdi_unmappable);
        bdi_destroy(&mtd_bdi_ro_mappable);
index 961a38408542b70df255a8ba05871a2ffb43a3fa..7b0353399a10642f44ca4d15994942066693e9b2 100644 (file)
@@ -1,23 +1,21 @@
-/* linux/drivers/mtd/mtdcore.h
- *
- * Header file for driver private mtdcore exports
- *
+/*
+ * These are exported solely for the purpose of mtd_blkdevs.c and mtdchar.c.
+ * You should not use them for _anything_ else.
  */
 
-/* These are exported solely for the purpose of mtd_blkdevs.c. You
-   should not use them for _anything_ else */
-
 extern struct mutex mtd_table_mutex;
-extern struct mtd_info *__mtd_next_device(int i);
 
-extern int add_mtd_device(struct mtd_info *mtd);
-extern int del_mtd_device(struct mtd_info *mtd);
-extern int add_mtd_partitions(struct mtd_info *, const struct mtd_partition *,
-                             int);
-extern int del_mtd_partitions(struct mtd_info *);
-extern int parse_mtd_partitions(struct mtd_info *master, const char **types,
-                               struct mtd_partition **pparts,
-                               struct mtd_part_parser_data *data);
+struct mtd_info *__mtd_next_device(int i);
+int add_mtd_device(struct mtd_info *mtd);
+int del_mtd_device(struct mtd_info *mtd);
+int add_mtd_partitions(struct mtd_info *, const struct mtd_partition *, int);
+int del_mtd_partitions(struct mtd_info *);
+int parse_mtd_partitions(struct mtd_info *master, const char * const *types,
+                        struct mtd_partition **pparts,
+                        struct mtd_part_parser_data *data);
+
+int __init init_mtdchar(void);
+void __exit cleanup_mtdchar(void);
 
 #define mtd_for_each_device(mtd)                       \
        for ((mtd) = __mtd_next_device(0);              \
index 70fa70a8318f4584c473a7fabd35a7bc4fad6027..301493382cd0a27a0df3fe0b5285fdb6a8cbb5f0 100644 (file)
@@ -694,7 +694,7 @@ EXPORT_SYMBOL_GPL(deregister_mtd_parser);
  * Do not forget to update 'parse_mtd_partitions()' kerneldoc comment if you
  * are changing this array!
  */
-static const char *default_mtd_part_types[] = {
+static const char * const default_mtd_part_types[] = {
        "cmdlinepart",
        "ofpart",
        NULL
@@ -720,7 +720,7 @@ static const char *default_mtd_part_types[] = {
  * o a positive number of found partitions, in which case on exit @pparts will
  *   point to an array containing this number of &struct mtd_info objects.
  */
-int parse_mtd_partitions(struct mtd_info *master, const char **types,
+int parse_mtd_partitions(struct mtd_info *master, const char *const *types,
                         struct mtd_partition **pparts,
                         struct mtd_part_parser_data *data)
 {
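Note: the const char * const conversions that recur throughout this series constrain both levels of indirection, the strings and the pointer array itself, so the probe-type tables can live in read-only data. Illustrative sketch (all names made up):

	/* Both the array and the strings it points to are const. */
	static const char * const part_probe_types[] = {
		"cmdlinepart", "ofpart", NULL
	};

	static void walk_probe_types(void)
	{
		const char * const *type;	/* matching pointer type for iteration */

		for (type = part_probe_types; *type; type++)
			;	/* each *type names a candidate parser */
	}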
index 81bf5e52601e00177ef6b20a84c7536643d42594..a60f6c17f57b62acd19528b62a7d1b329df12913 100644 (file)
@@ -41,14 +41,6 @@ config MTD_SM_COMMON
        tristate
        default n
 
-config MTD_NAND_MUSEUM_IDS
-       bool "Enable chip ids for obsolete ancient NAND devices"
-       default n
-       help
-         Enable this option only when your board has first generation
-         NAND chips (page size 256 byte, erase size 4-8KiB). The IDs
-         of these chips were reused by later, larger chips.
-
 config MTD_NAND_DENALI
         tristate "Support Denali NAND controller"
         help
@@ -81,15 +73,9 @@ config MTD_NAND_DENALI_SCRATCH_REG_ADDR
           scratch register here to enable this feature. On Intel Moorestown
           boards, the scratch register is at 0xFF108018.
 
-config MTD_NAND_H1900
-       tristate "iPAQ H1900 flash"
-       depends on ARCH_PXA && BROKEN
-       help
-         This enables the driver for the iPAQ h1900 flash.
-
 config MTD_NAND_GPIO
        tristate "GPIO NAND Flash driver"
-       depends on GENERIC_GPIO && ARM
+       depends on GPIOLIB && ARM
        help
          This enables a GPIO based NAND flash driver.
 
@@ -201,22 +187,6 @@ config MTD_NAND_BF5XX_BOOTROM_ECC
 
          If unsure, say N.
 
-config MTD_NAND_RTC_FROM4
-       tristate "Renesas Flash ROM 4-slot interface board (FROM_BOARD4)"
-       depends on SH_SOLUTION_ENGINE
-       select REED_SOLOMON
-       select REED_SOLOMON_DEC8
-       select BITREVERSE
-       help
-         This enables the driver for the Renesas Technology AG-AND
-         flash interface board (FROM_BOARD4)
-
-config MTD_NAND_PPCHAMELEONEVB
-       tristate "NAND Flash device on PPChameleonEVB board"
-       depends on PPCHAMELEONEVB && BROKEN
-       help
-         This enables the NAND flash driver on the PPChameleon EVB Board.
-
 config MTD_NAND_S3C2410
        tristate "NAND Flash support for Samsung S3C SoCs"
        depends on ARCH_S3C24XX || ARCH_S3C64XX
index d76d912056916a327bcf7ecace6df9dcd06ea97c..bb8189172f62f49871573bb6119555fce1697654 100644 (file)
@@ -15,14 +15,11 @@ obj-$(CONFIG_MTD_NAND_DENALI_PCI)   += denali_pci.o
 obj-$(CONFIG_MTD_NAND_DENALI_DT)       += denali_dt.o
 obj-$(CONFIG_MTD_NAND_AU1550)          += au1550nd.o
 obj-$(CONFIG_MTD_NAND_BF5XX)           += bf5xx_nand.o
-obj-$(CONFIG_MTD_NAND_PPCHAMELEONEVB)  += ppchameleonevb.o
 obj-$(CONFIG_MTD_NAND_S3C2410)         += s3c2410.o
 obj-$(CONFIG_MTD_NAND_DAVINCI)         += davinci_nand.o
 obj-$(CONFIG_MTD_NAND_DISKONCHIP)      += diskonchip.o
 obj-$(CONFIG_MTD_NAND_DOCG4)           += docg4.o
 obj-$(CONFIG_MTD_NAND_FSMC)            += fsmc_nand.o
-obj-$(CONFIG_MTD_NAND_H1900)           += h1910.o
-obj-$(CONFIG_MTD_NAND_RTC_FROM4)       += rtc_from4.o
 obj-$(CONFIG_MTD_NAND_SHARPSL)         += sharpsl.o
 obj-$(CONFIG_MTD_NAND_NANDSIM)         += nandsim.o
 obj-$(CONFIG_MTD_NAND_CS553X)          += cs553x_nand.o
index ffcbcca2fd2dbc67627005c177750ef5c35e5da2..2d23d2929438053a255ccad183e111ac7fe633ba 100644 (file)
@@ -1737,20 +1737,7 @@ static struct platform_driver atmel_nand_driver = {
        },
 };
 
-static int __init atmel_nand_init(void)
-{
-       return platform_driver_probe(&atmel_nand_driver, atmel_nand_probe);
-}
-
-
-static void __exit atmel_nand_exit(void)
-{
-       platform_driver_unregister(&atmel_nand_driver);
-}
-
-
-module_init(atmel_nand_init);
-module_exit(atmel_nand_exit);
+module_platform_driver_probe(atmel_nand_driver, atmel_nand_probe);
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Rick Bronson");
index 4271e948d1e255b149ce6d5c8c64c38b752ef472..776df3694f755865f88b7e0a499cac533dd04142 100644 (file)
@@ -874,21 +874,7 @@ static struct platform_driver bf5xx_nand_driver = {
        },
 };
 
-static int __init bf5xx_nand_init(void)
-{
-       printk(KERN_INFO "%s, Version %s (c) 2007 Analog Devices, Inc.\n",
-               DRV_DESC, DRV_VERSION);
-
-       return platform_driver_register(&bf5xx_nand_driver);
-}
-
-static void __exit bf5xx_nand_exit(void)
-{
-       platform_driver_unregister(&bf5xx_nand_driver);
-}
-
-module_init(bf5xx_nand_init);
-module_exit(bf5xx_nand_exit);
+module_platform_driver(bf5xx_nand_driver);
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR(DRV_AUTHOR);
index 010d612665363632d47f9d0bc75bafdb49c2c9dd..c34985a55101b0606f76ed580f1e13ac562913e2 100644 (file)
@@ -303,13 +303,7 @@ static void cafe_nand_cmdfunc(struct mtd_info *mtd, unsigned command,
        case NAND_CMD_SEQIN:
        case NAND_CMD_RNDIN:
        case NAND_CMD_STATUS:
-       case NAND_CMD_DEPLETE1:
        case NAND_CMD_RNDOUT:
-       case NAND_CMD_STATUS_ERROR:
-       case NAND_CMD_STATUS_ERROR0:
-       case NAND_CMD_STATUS_ERROR1:
-       case NAND_CMD_STATUS_ERROR2:
-       case NAND_CMD_STATUS_ERROR3:
                cafe_writel(cafe, cafe->ctl2, NAND_CTRL2);
                return;
        }
@@ -536,8 +530,8 @@ static int cafe_nand_write_page_lowlevel(struct mtd_info *mtd,
 }
 
 static int cafe_nand_write_page(struct mtd_info *mtd, struct nand_chip *chip,
-                               const uint8_t *buf, int oob_required, int page,
-                               int cached, int raw)
+                       uint32_t offset, int data_len, const uint8_t *buf,
+                       int oob_required, int page, int cached, int raw)
 {
        int status;
 
index 94e17af8e4503b423d17f7f0f155d9af928bde90..c3e15a55817349eb4ed2e1c86009b042c38e2405 100644 (file)
@@ -34,6 +34,7 @@
 #include <linux/mtd/partitions.h>
 #include <linux/slab.h>
 #include <linux/of_device.h>
+#include <linux/of.h>
 
 #include <linux/platform_data/mtd-davinci.h>
 #include <linux/platform_data/mtd-davinci-aemif.h>
@@ -577,7 +578,6 @@ static struct davinci_nand_pdata
        return pdev->dev.platform_data;
 }
 #else
-#define davinci_nand_of_match NULL
 static struct davinci_nand_pdata
        *nand_davinci_get_pdata(struct platform_device *pdev)
 {
@@ -878,22 +878,12 @@ static struct platform_driver nand_davinci_driver = {
        .driver         = {
                .name   = "davinci_nand",
                .owner  = THIS_MODULE,
-               .of_match_table = davinci_nand_of_match,
+               .of_match_table = of_match_ptr(davinci_nand_of_match),
        },
 };
 MODULE_ALIAS("platform:davinci_nand");
 
-static int __init nand_davinci_init(void)
-{
-       return platform_driver_probe(&nand_davinci_driver, nand_davinci_probe);
-}
-module_init(nand_davinci_init);
-
-static void __exit nand_davinci_exit(void)
-{
-       platform_driver_unregister(&nand_davinci_driver);
-}
-module_exit(nand_davinci_exit);
+module_platform_driver_probe(nand_davinci_driver, nand_davinci_probe);
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Texas Instruments");
index 546f8cb5688d52ce80a10df1cd82810b256a9adb..92530244e2cbfdf48ecc2aa2282b90d12a98d0ad 100644 (file)
@@ -42,7 +42,7 @@ static void __iomem *request_and_map(struct device *dev,
        }
 
        ptr = devm_ioremap_nocache(dev, res->start, resource_size(res));
-       if (!res)
+       if (!ptr)
                dev_err(dev, "ioremap_nocache of %s failed!", res->name);
 
        return ptr;
@@ -90,7 +90,7 @@ static int denali_dt_probe(struct platform_device *ofdev)
        denali->irq = platform_get_irq(ofdev, 0);
        if (denali->irq < 0) {
                dev_err(&ofdev->dev, "no irq defined\n");
-               return -ENXIO;
+               return denali->irq;
        }
 
        denali->flash_reg = request_and_map(&ofdev->dev, denali_reg);
@@ -146,21 +146,11 @@ static struct platform_driver denali_dt_driver = {
        .driver         = {
                .name   = "denali-nand-dt",
                .owner  = THIS_MODULE,
-               .of_match_table = of_match_ptr(denali_nand_dt_ids),
+               .of_match_table = denali_nand_dt_ids,
        },
 };
 
-static int __init denali_init_dt(void)
-{
-       return platform_driver_register(&denali_dt_driver);
-}
-module_init(denali_init_dt);
-
-static void __exit denali_exit_dt(void)
-{
-       platform_driver_unregister(&denali_dt_driver);
-}
-module_exit(denali_exit_dt);
+module_platform_driver(denali_dt_driver);
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Jamie Iles");
index 18fa4489e52e6687046815b9ed204b4807ebb64c..fa25e7a08134d1cc6bf3d0a68eb15da42a44524b 100644 (file)
@@ -1397,18 +1397,7 @@ static struct platform_driver docg4_driver = {
        .remove         = __exit_p(cleanup_docg4),
 };
 
-static int __init docg4_init(void)
-{
-       return platform_driver_probe(&docg4_driver, probe_docg4);
-}
-
-static void __exit docg4_exit(void)
-{
-       platform_driver_unregister(&docg4_driver);
-}
-
-module_init(docg4_init);
-module_exit(docg4_exit);
+module_platform_driver_probe(docg4_driver, probe_docg4);
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Mike Dunn");
index 05ba3f0c2d194ee5c6a0d745f0a78b6744f23486..911e2433fe304b107f1ad8b585621d653578e302 100644 (file)
@@ -1235,18 +1235,7 @@ static struct platform_driver fsmc_nand_driver = {
        },
 };
 
-static int __init fsmc_nand_init(void)
-{
-       return platform_driver_probe(&fsmc_nand_driver,
-                                    fsmc_nand_probe);
-}
-module_init(fsmc_nand_init);
-
-static void __exit fsmc_nand_exit(void)
-{
-       platform_driver_unregister(&fsmc_nand_driver);
-}
-module_exit(fsmc_nand_exit);
+module_platform_driver_probe(fsmc_nand_driver, fsmc_nand_probe);
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Vipin Kumar <vipin.kumar@st.com>, Ashish Priyadarshi");
index e789e3f517109402ac35d26fce9e778b7366c3fd..89065dd83d64d7ad64a6d83ca318f36ae65ec6cf 100644 (file)
@@ -190,7 +190,6 @@ static struct resource *gpio_nand_get_io_sync_of(struct platform_device *pdev)
        return r;
 }
 #else /* CONFIG_OF */
-#define gpio_nand_id_table NULL
 static inline int gpio_nand_get_config_of(const struct device *dev,
                                          struct gpio_nand_platdata *plat)
 {
@@ -259,8 +258,6 @@ static int gpio_nand_remove(struct platform_device *dev)
        if (gpio_is_valid(gpiomtd->plat.gpio_rdy))
                gpio_free(gpiomtd->plat.gpio_rdy);
 
-       kfree(gpiomtd);
-
        return 0;
 }
 
@@ -297,7 +294,7 @@ static int gpio_nand_probe(struct platform_device *dev)
        if (!res0)
                return -EINVAL;
 
-       gpiomtd = kzalloc(sizeof(*gpiomtd), GFP_KERNEL);
+       gpiomtd = devm_kzalloc(&dev->dev, sizeof(*gpiomtd), GFP_KERNEL);
        if (gpiomtd == NULL) {
                dev_err(&dev->dev, "failed to create NAND MTD\n");
                return -ENOMEM;
@@ -412,7 +409,6 @@ err_sync:
        iounmap(gpiomtd->nand_chip.IO_ADDR_R);
        release_mem_region(res0->start, resource_size(res0));
 err_map:
-       kfree(gpiomtd);
        return ret;
 }
 
@@ -421,7 +417,7 @@ static struct platform_driver gpio_nand_driver = {
        .remove         = gpio_nand_remove,
        .driver         = {
                .name   = "gpio-nand",
-               .of_match_table = gpio_nand_id_table,
+               .of_match_table = of_match_ptr(gpio_nand_id_table),
        },
 };
 
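Note: the devm_kzalloc() switch above is why the kfree() calls vanish from both the remove path and the probe error path; device-managed memory is released automatically once probe fails or the device is unbound. Minimal sketch of the pattern (structure and names hypothetical):

	#include <linux/platform_device.h>
	#include <linux/slab.h>

	struct example_priv { int state; };

	static int example_probe(struct platform_device *pdev)
	{
		struct example_priv *priv;

		/* Freed automatically on probe failure or device removal. */
		priv = devm_kzalloc(&pdev->dev, sizeof(*priv), GFP_KERNEL);
		if (!priv)
			return -ENOMEM;

		platform_set_drvdata(pdev, priv);
		return 0;	/* no matching kfree() needed in remove() */
	}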
diff --git a/drivers/mtd/nand/h1910.c b/drivers/mtd/nand/h1910.c
deleted file mode 100644 (file)
index 50166e9..0000000
+++ /dev/null
@@ -1,167 +0,0 @@
-/*
- *  drivers/mtd/nand/h1910.c
- *
- *  Copyright (C) 2003 Joshua Wise (joshua@joshuawise.com)
- *
- *  Derived from drivers/mtd/nand/edb7312.c
- *       Copyright (C) 2002 Marius Gröger (mag@sysgo.de)
- *       Copyright (c) 2001 Thomas Gleixner (gleixner@autronix.de)
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- *  Overview:
- *   This is a device driver for the NAND flash device found on the
- *   iPAQ h1910 board which utilizes the Samsung K9F2808 part. This is
- *   a 128Mibit (16MiB x 8 bits) NAND flash device.
- */
-
-#include <linux/slab.h>
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/mtd/mtd.h>
-#include <linux/mtd/nand.h>
-#include <linux/mtd/partitions.h>
-#include <asm/io.h>
-#include <mach/hardware.h>
-#include <asm/sizes.h>
-#include <mach/h1900-gpio.h>
-#include <mach/ipaq.h>
-
-/*
- * MTD structure for EDB7312 board
- */
-static struct mtd_info *h1910_nand_mtd = NULL;
-
-/*
- * Module stuff
- */
-
-/*
- * Define static partitions for flash device
- */
-static struct mtd_partition partition_info[] = {
-      {name:"h1910 NAND Flash",
-             offset:0,
-      size:16 * 1024 * 1024}
-};
-
-#define NUM_PARTITIONS 1
-
-/*
- *     hardware specific access to control-lines
- *
- *     NAND_NCE: bit 0 - don't care
- *     NAND_CLE: bit 1 - address bit 2
- *     NAND_ALE: bit 2 - address bit 3
- */
-static void h1910_hwcontrol(struct mtd_info *mtd, int cmd,
-                           unsigned int ctrl)
-{
-       struct nand_chip *chip = mtd->priv;
-
-       if (cmd != NAND_CMD_NONE)
-               writeb(cmd, chip->IO_ADDR_W | ((ctrl & 0x6) << 1));
-}
-
-/*
- *     read device ready pin
- */
-#if 0
-static int h1910_device_ready(struct mtd_info *mtd)
-{
-       return (GPLR(55) & GPIO_bit(55));
-}
-#endif
-
-/*
- * Main initialization routine
- */
-static int __init h1910_init(void)
-{
-       struct nand_chip *this;
-       void __iomem *nandaddr;
-
-       if (!machine_is_h1900())
-               return -ENODEV;
-
-       nandaddr = ioremap(0x08000000, 0x1000);
-       if (!nandaddr) {
-               printk("Failed to ioremap nand flash.\n");
-               return -ENOMEM;
-       }
-
-       /* Allocate memory for MTD device structure and private data */
-       h1910_nand_mtd = kmalloc(sizeof(struct mtd_info) + sizeof(struct nand_chip), GFP_KERNEL);
-       if (!h1910_nand_mtd) {
-               printk("Unable to allocate h1910 NAND MTD device structure.\n");
-               iounmap((void *)nandaddr);
-               return -ENOMEM;
-       }
-
-       /* Get pointer to private data */
-       this = (struct nand_chip *)(&h1910_nand_mtd[1]);
-
-       /* Initialize structures */
-       memset(h1910_nand_mtd, 0, sizeof(struct mtd_info));
-       memset(this, 0, sizeof(struct nand_chip));
-
-       /* Link the private data with the MTD structure */
-       h1910_nand_mtd->priv = this;
-       h1910_nand_mtd->owner = THIS_MODULE;
-
-       /*
-        * Enable VPEN
-        */
-       GPSR(37) = GPIO_bit(37);
-
-       /* insert callbacks */
-       this->IO_ADDR_R = nandaddr;
-       this->IO_ADDR_W = nandaddr;
-       this->cmd_ctrl = h1910_hwcontrol;
-       this->dev_ready = NULL; /* unknown whether that was correct or not so we will just do it like this */
-       /* 15 us command delay time */
-       this->chip_delay = 50;
-       this->ecc.mode = NAND_ECC_SOFT;
-
-       /* Scan to find existence of the device */
-       if (nand_scan(h1910_nand_mtd, 1)) {
-               printk(KERN_NOTICE "No NAND device - returning -ENXIO\n");
-               kfree(h1910_nand_mtd);
-               iounmap((void *)nandaddr);
-               return -ENXIO;
-       }
-
-       /* Register the partitions */
-       mtd_device_parse_register(h1910_nand_mtd, NULL, NULL, partition_info,
-                                 NUM_PARTITIONS);
-
-       /* Return happy */
-       return 0;
-}
-
-module_init(h1910_init);
-
-/*
- * Clean up routine
- */
-static void __exit h1910_cleanup(void)
-{
-       struct nand_chip *this = (struct nand_chip *)&h1910_nand_mtd[1];
-
-       /* Release resources, unregister device */
-       nand_release(h1910_nand_mtd);
-
-       /* Release io resource */
-       iounmap((void *)this->IO_ADDR_W);
-
-       /* Free the MTD device structure */
-       kfree(h1910_nand_mtd);
-}
-
-module_exit(h1910_cleanup);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Joshua Wise <joshua at joshuawise dot com>");
-MODULE_DESCRIPTION("NAND flash driver for iPAQ h1910");
index 0ca22ae9135c6cb9162f06c43f2b316e029e0bab..a94facb46e5ca76bb239af03a32b160571ccd467 100644 (file)
@@ -540,8 +540,8 @@ static int lpc32xx_write_page_lowlevel(struct mtd_info *mtd,
 }
 
 static int lpc32xx_write_page(struct mtd_info *mtd, struct nand_chip *chip,
-                             const uint8_t *buf, int oob_required, int page,
-                             int cached, int raw)
+                       uint32_t offset, int data_len, const uint8_t *buf,
+                       int oob_required, int page, int cached, int raw)
 {
        int res;
 
index 42c63927609dbee8697c691c8e97515cf509b990..dfcd0a565c5b3e8f66d9b24077ae3701f132ee36 100644 (file)
@@ -4,7 +4,6 @@
  *  Overview:
  *   This is the generic MTD driver for NAND flash devices. It should be
  *   capable of working with almost all NAND chips currently available.
- *   Basic support for AG-AND chips is provided.
  *
  *     Additional technical information is available on
  *     http://www.linux-mtd.infradead.org/doc/nand.html
@@ -22,8 +21,6 @@
  *     Enable cached programming for 2k page size chips
  *     Check, if mtd->ecctype should be set to MTD_ECC_HW
  *     if we have HW ECC support.
- *     The AG-AND chips have nice features for speed improvement,
- *     which are not supported yet. Read / program 4 pages in one go.
  *     BBT table is not serialized, has to be fixed
  *
  * This program is free software; you can redistribute it and/or modify
@@ -515,7 +512,7 @@ EXPORT_SYMBOL_GPL(nand_wait_ready);
  * @page_addr: the page address for this command, -1 if none
  *
  * Send command to NAND device. This function is used for small page devices
- * (256/512 Bytes per page).
+ * (512 Bytes per page).
  */
 static void nand_command(struct mtd_info *mtd, unsigned int command,
                         int column, int page_addr)
@@ -631,8 +628,7 @@ static void nand_command_lp(struct mtd_info *mtd, unsigned int command,
        }
 
        /* Command latch cycle */
-       chip->cmd_ctrl(mtd, command & 0xff,
-                      NAND_NCE | NAND_CLE | NAND_CTRL_CHANGE);
+       chip->cmd_ctrl(mtd, command, NAND_NCE | NAND_CLE | NAND_CTRL_CHANGE);
 
        if (column != -1 || page_addr != -1) {
                int ctrl = NAND_CTRL_CHANGE | NAND_NCE | NAND_ALE;
@@ -671,16 +667,6 @@ static void nand_command_lp(struct mtd_info *mtd, unsigned int command,
        case NAND_CMD_SEQIN:
        case NAND_CMD_RNDIN:
        case NAND_CMD_STATUS:
-       case NAND_CMD_DEPLETE1:
-               return;
-
-       case NAND_CMD_STATUS_ERROR:
-       case NAND_CMD_STATUS_ERROR0:
-       case NAND_CMD_STATUS_ERROR1:
-       case NAND_CMD_STATUS_ERROR2:
-       case NAND_CMD_STATUS_ERROR3:
-               /* Read error status commands require only a short delay */
-               udelay(chip->chip_delay);
                return;
 
        case NAND_CMD_RESET:
@@ -836,10 +822,7 @@ static int nand_wait(struct mtd_info *mtd, struct nand_chip *chip)
         */
        ndelay(100);
 
-       if ((state == FL_ERASING) && (chip->options & NAND_IS_AND))
-               chip->cmdfunc(mtd, NAND_CMD_STATUS_MULTI, -1, -1);
-       else
-               chip->cmdfunc(mtd, NAND_CMD_STATUS, -1, -1);
+       chip->cmdfunc(mtd, NAND_CMD_STATUS, -1, -1);
 
        if (in_interrupt() || oops_in_progress)
                panic_nand_wait(mtd, chip, timeo);
@@ -1127,7 +1110,7 @@ static int nand_read_page_swecc(struct mtd_info *mtd, struct nand_chip *chip,
 }
 
 /**
- * nand_read_subpage - [REPLACEABLE] software ECC based sub-page read function
+ * nand_read_subpage - [REPLACEABLE] ECC based sub-page read function
  * @mtd: mtd info structure
  * @chip: nand chip info structure
  * @data_offs: offset of requested data within the page
@@ -1995,6 +1978,67 @@ static int nand_write_page_hwecc(struct mtd_info *mtd, struct nand_chip *chip,
        return 0;
 }
 
+
+/**
+ * nand_write_subpage_hwecc - [REPLACABLE] hardware ECC based subpage write
+ * @mtd:       mtd info structure
+ * @chip:      nand chip info structure
+ * @column:    column address of subpage within the page
+ * @data_len:  data length
+ * @oob_required: must write chip->oob_poi to OOB
+ */
+static int nand_write_subpage_hwecc(struct mtd_info *mtd,
+                               struct nand_chip *chip, uint32_t offset,
+                               uint32_t data_len, const uint8_t *data_buf,
+                               int oob_required)
+{
+       uint8_t *oob_buf  = chip->oob_poi;
+       uint8_t *ecc_calc = chip->buffers->ecccalc;
+       int ecc_size      = chip->ecc.size;
+       int ecc_bytes     = chip->ecc.bytes;
+       int ecc_steps     = chip->ecc.steps;
+       uint32_t *eccpos  = chip->ecc.layout->eccpos;
+       uint32_t start_step = offset / ecc_size;
+       uint32_t end_step   = (offset + data_len - 1) / ecc_size;
+       int oob_bytes       = mtd->oobsize / ecc_steps;
+       int step, i;
+
+       for (step = 0; step < ecc_steps; step++) {
+               /* configure controller for WRITE access */
+               chip->ecc.hwctl(mtd, NAND_ECC_WRITE);
+
+               /* write data (untouched subpages already masked by 0xFF) */
+               chip->write_buf(mtd, data_buf, ecc_size);
+
+               /* mask ECC of un-touched subpages by padding 0xFF */
+               if ((step < start_step) || (step > end_step))
+                       memset(ecc_calc, 0xff, ecc_bytes);
+               else
+                       chip->ecc.calculate(mtd, data_buf, ecc_calc);
+
+               /* mask OOB of un-touched subpages by padding 0xFF */
+               /* if oob_required, preserve OOB metadata of written subpage */
+               if (!oob_required || (step < start_step) || (step > end_step))
+                       memset(oob_buf, 0xff, oob_bytes);
+
+               data_buf += ecc_size;
+               ecc_calc += ecc_bytes;
+               oob_buf  += oob_bytes;
+       }
+
+       /* copy calculated ECC for whole page to chip->buffer->oob */
+       /* this include masked-value(0xFF) for unwritten subpages */
+       ecc_calc = chip->buffers->ecccalc;
+       for (i = 0; i < chip->ecc.total; i++)
+               chip->oob_poi[eccpos[i]] = ecc_calc[i];
+
+       /* write OOB buffer to NAND device */
+       chip->write_buf(mtd, chip->oob_poi, mtd->oobsize);
+
+       return 0;
+}
+
+
 /**
  * nand_write_page_syndrome - [REPLACEABLE] hardware ECC syndrome based page write
  * @mtd: mtd info structure
@@ -2047,6 +2091,8 @@ static int nand_write_page_syndrome(struct mtd_info *mtd,
  * nand_write_page - [REPLACEABLE] write one page
  * @mtd: MTD device structure
  * @chip: NAND chip descriptor
+ * @offset: address offset within the page
+ * @data_len: length of actual data to be written
  * @buf: the data to write
  * @oob_required: must write chip->oob_poi to OOB
  * @page: page number to write
@@ -2054,15 +2100,25 @@ static int nand_write_page_syndrome(struct mtd_info *mtd,
  * @raw: use _raw version of write_page
  */
 static int nand_write_page(struct mtd_info *mtd, struct nand_chip *chip,
-                          const uint8_t *buf, int oob_required, int page,
-                          int cached, int raw)
+               uint32_t offset, int data_len, const uint8_t *buf,
+               int oob_required, int page, int cached, int raw)
 {
-       int status;
+       int status, subpage;
+
+       if (!(chip->options & NAND_NO_SUBPAGE_WRITE) &&
+               chip->ecc.write_subpage)
+               subpage = offset || (data_len < mtd->writesize);
+       else
+               subpage = 0;
 
        chip->cmdfunc(mtd, NAND_CMD_SEQIN, 0x00, page);
 
        if (unlikely(raw))
-               status = chip->ecc.write_page_raw(mtd, chip, buf, oob_required);
+               status = chip->ecc.write_page_raw(mtd, chip, buf,
+                                                       oob_required);
+       else if (subpage)
+               status = chip->ecc.write_subpage(mtd, chip, offset, data_len,
+                                                        buf, oob_required);
        else
                status = chip->ecc.write_page(mtd, chip, buf, oob_required);
 
@@ -2075,7 +2131,7 @@ static int nand_write_page(struct mtd_info *mtd, struct nand_chip *chip,
         */
        cached = 0;
 
-       if (!cached || !(chip->options & NAND_CACHEPRG)) {
+       if (!cached || !NAND_HAS_CACHEPROG(chip)) {
 
                chip->cmdfunc(mtd, NAND_CMD_PAGEPROG, -1, -1);
                status = chip->waitfunc(mtd, chip);
@@ -2176,7 +2232,7 @@ static int nand_do_write_ops(struct mtd_info *mtd, loff_t to,
 
        uint8_t *oob = ops->oobbuf;
        uint8_t *buf = ops->datbuf;
-       int ret, subpage;
+       int ret;
        int oob_required = oob ? 1 : 0;
 
        ops->retlen = 0;
@@ -2191,10 +2247,6 @@ static int nand_do_write_ops(struct mtd_info *mtd, loff_t to,
        }
 
        column = to & (mtd->writesize - 1);
-       subpage = column || (writelen & (mtd->writesize - 1));
-
-       if (subpage && oob)
-               return -EINVAL;
 
        chipnr = (int)(to >> chip->chip_shift);
        chip->select_chip(mtd, chipnr);
@@ -2243,9 +2295,9 @@ static int nand_do_write_ops(struct mtd_info *mtd, loff_t to,
                        /* We still need to erase leftover OOB data */
                        memset(chip->oob_poi, 0xff, mtd->oobsize);
                }
-
-               ret = chip->write_page(mtd, chip, wbuf, oob_required, page,
-                                      cached, (ops->mode == MTD_OPS_RAW));
+               ret = chip->write_page(mtd, chip, column, bytes, wbuf,
+                                       oob_required, page, cached,
+                                       (ops->mode == MTD_OPS_RAW));
                if (ret)
                        break;
 
@@ -2480,24 +2532,6 @@ static void single_erase_cmd(struct mtd_info *mtd, int page)
        chip->cmdfunc(mtd, NAND_CMD_ERASE2, -1, -1);
 }
 
-/**
- * multi_erase_cmd - [GENERIC] AND specific block erase command function
- * @mtd: MTD device structure
- * @page: the page address of the block which will be erased
- *
- * AND multi block erase command function. Erase 4 consecutive blocks.
- */
-static void multi_erase_cmd(struct mtd_info *mtd, int page)
-{
-       struct nand_chip *chip = mtd->priv;
-       /* Send commands to erase a block */
-       chip->cmdfunc(mtd, NAND_CMD_ERASE1, -1, page++);
-       chip->cmdfunc(mtd, NAND_CMD_ERASE1, -1, page++);
-       chip->cmdfunc(mtd, NAND_CMD_ERASE1, -1, page++);
-       chip->cmdfunc(mtd, NAND_CMD_ERASE1, -1, page);
-       chip->cmdfunc(mtd, NAND_CMD_ERASE2, -1, -1);
-}
-
 /**
  * nand_erase - [MTD Interface] erase block(s)
  * @mtd: MTD device structure
@@ -2510,7 +2544,6 @@ static int nand_erase(struct mtd_info *mtd, struct erase_info *instr)
        return nand_erase_nand(mtd, instr, 0);
 }
 
-#define BBT_PAGE_MASK  0xffffff3f
 /**
  * nand_erase_nand - [INTERN] erase block(s)
  * @mtd: MTD device structure
@@ -2524,8 +2557,6 @@ int nand_erase_nand(struct mtd_info *mtd, struct erase_info *instr,
 {
        int page, status, pages_per_block, ret, chipnr;
        struct nand_chip *chip = mtd->priv;
-       loff_t rewrite_bbt[NAND_MAX_CHIPS] = {0};
-       unsigned int bbt_masked_page = 0xffffffff;
        loff_t len;
 
        pr_debug("%s: start = 0x%012llx, len = %llu\n",
@@ -2556,15 +2587,6 @@ int nand_erase_nand(struct mtd_info *mtd, struct erase_info *instr,
                goto erase_exit;
        }
 
-       /*
-        * If BBT requires refresh, set the BBT page mask to see if the BBT
-        * should be rewritten. Otherwise the mask is set to 0xffffffff which
-        * can not be matched. This is also done when the bbt is actually
-        * erased to avoid recursive updates.
-        */
-       if (chip->options & BBT_AUTO_REFRESH && !allowbbt)
-               bbt_masked_page = chip->bbt_td->pages[chipnr] & BBT_PAGE_MASK;
-
        /* Loop through the pages */
        len = instr->len;
 
@@ -2610,15 +2632,6 @@ int nand_erase_nand(struct mtd_info *mtd, struct erase_info *instr,
                        goto erase_exit;
                }
 
-               /*
-                * If BBT requires refresh, set the BBT rewrite flag to the
-                * page being erased.
-                */
-               if (bbt_masked_page != 0xffffffff &&
-                   (page & BBT_PAGE_MASK) == bbt_masked_page)
-                           rewrite_bbt[chipnr] =
-                                       ((loff_t)page << chip->page_shift);
-
                /* Increment page address and decrement length */
                len -= (1 << chip->phys_erase_shift);
                page += pages_per_block;
@@ -2628,15 +2641,6 @@ int nand_erase_nand(struct mtd_info *mtd, struct erase_info *instr,
                        chipnr++;
                        chip->select_chip(mtd, -1);
                        chip->select_chip(mtd, chipnr);
-
-                       /*
-                        * If BBT requires refresh and BBT-PERCHIP, set the BBT
-                        * page mask to see if this BBT should be rewritten.
-                        */
-                       if (bbt_masked_page != 0xffffffff &&
-                           (chip->bbt_td->options & NAND_BBT_PERCHIP))
-                               bbt_masked_page = chip->bbt_td->pages[chipnr] &
-                                       BBT_PAGE_MASK;
                }
        }
        instr->state = MTD_ERASE_DONE;
@@ -2653,23 +2657,6 @@ erase_exit:
        if (!ret)
                mtd_erase_callback(instr);
 
-       /*
-        * If BBT requires refresh and erase was successful, rewrite any
-        * selected bad block tables.
-        */
-       if (bbt_masked_page == 0xffffffff || ret)
-               return ret;
-
-       for (chipnr = 0; chipnr < chip->numchips; chipnr++) {
-               if (!rewrite_bbt[chipnr])
-                       continue;
-               /* Update the BBT for chip */
-               pr_debug("%s: nand_update_bbt (%d:0x%0llx 0x%0x)\n",
-                               __func__, chipnr, rewrite_bbt[chipnr],
-                               chip->bbt_td->pages[chipnr]);
-               nand_update_bbt(mtd, rewrite_bbt[chipnr]);
-       }
-
        /* Return more or less happy */
        return ret;
 }
@@ -2905,8 +2892,6 @@ static int nand_flash_detect_onfi(struct mtd_info *mtd, struct nand_chip *chip,
                chip->onfi_version = 20;
        else if (val & (1 << 1))
                chip->onfi_version = 10;
-       else
-               chip->onfi_version = 0;
 
        if (!chip->onfi_version) {
                pr_info("%s: unsupported ONFI version: %d\n", __func__, val);
@@ -3171,6 +3156,30 @@ static void nand_decode_bbm_options(struct mtd_info *mtd,
                chip->bbt_options |= NAND_BBT_SCAN2NDPAGE;
 }
 
+static inline bool is_full_id_nand(struct nand_flash_dev *type)
+{
+       return type->id_len;
+}
+
+static bool find_full_id_nand(struct mtd_info *mtd, struct nand_chip *chip,
+                  struct nand_flash_dev *type, u8 *id_data, int *busw)
+{
+       if (!strncmp(type->id, id_data, type->id_len)) {
+               mtd->writesize = type->pagesize;
+               mtd->erasesize = type->erasesize;
+               mtd->oobsize = type->oobsize;
+
+               chip->cellinfo = id_data[2];
+               chip->chipsize = (uint64_t)type->chipsize << 20;
+               chip->options |= type->options;
+
+               *busw = type->options & NAND_BUSWIDTH_16;
+
+               return true;
+       }
+       return false;
+}
+
 /*
  * Get the flash and manufacturer id and lookup if the type is supported.
  */
@@ -3222,9 +3231,14 @@ static struct nand_flash_dev *nand_get_flash_type(struct mtd_info *mtd,
        if (!type)
                type = nand_flash_ids;
 
-       for (; type->name != NULL; type++)
-               if (*dev_id == type->id)
-                       break;
+       for (; type->name != NULL; type++) {
+               if (is_full_id_nand(type)) {
+                       if (find_full_id_nand(mtd, chip, type, id_data, &busw))
+                               goto ident_done;
+               } else if (*dev_id == type->dev_id) {
+                               break;
+               }
+       }
 
        chip->onfi_version = 0;
        if (!type->name || !type->pagesize) {
@@ -3302,12 +3316,7 @@ ident_done:
        }
 
        chip->badblockbits = 8;
-
-       /* Check for AND chips with 4 page planes */
-       if (chip->options & NAND_4PAGE_ARRAY)
-               chip->erase_cmd = multi_erase_cmd;
-       else
-               chip->erase_cmd = single_erase_cmd;
+       chip->erase_cmd = single_erase_cmd;
 
        /* Do not replace user supplied command function! */
        if (mtd->writesize > 512 && chip->cmdfunc == nand_command)
@@ -3474,6 +3483,10 @@ int nand_scan_tail(struct mtd_info *mtd)
                        chip->ecc.read_oob = nand_read_oob_std;
                if (!chip->ecc.write_oob)
                        chip->ecc.write_oob = nand_write_oob_std;
+               if (!chip->ecc.read_subpage)
+                       chip->ecc.read_subpage = nand_read_subpage;
+               if (!chip->ecc.write_subpage)
+                       chip->ecc.write_subpage = nand_write_subpage_hwecc;
 
        case NAND_ECC_HW_SYNDROME:
                if ((!chip->ecc.calculate || !chip->ecc.correct ||
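Note: as a worked illustration of the nand_write_page() dispatch added above, a partial write takes the subpage path while a full-page write does not. The numbers below are only an example, not from this patch:

	#include <linux/types.h>

	/* Same condition as in nand_write_page(): offset || (data_len < writesize) */
	static int is_subpage_write(uint32_t offset, int data_len, int writesize)
	{
		return offset || (data_len < writesize);
	}

	/* is_subpage_write(1024,  512, 2048) -> 1: routed through ecc.write_subpage() */
	/* is_subpage_write(   0, 2048, 2048) -> 0: full-page path via ecc.write_page() */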
index 916d6e9c0ab1d9c5872c0ed66b26832ff4bf9062..267264320e06587cbee58ad5e036a38b90f07fe9 100644 (file)
@@ -1240,15 +1240,6 @@ int nand_update_bbt(struct mtd_info *mtd, loff_t offs)
  */
 static uint8_t scan_ff_pattern[] = { 0xff, 0xff };
 
-static uint8_t scan_agand_pattern[] = { 0x1C, 0x71, 0xC7, 0x1C, 0x71, 0xC7 };
-
-static struct nand_bbt_descr agand_flashbased = {
-       .options = NAND_BBT_SCANEMPTY | NAND_BBT_SCANALLPAGES,
-       .offs = 0x20,
-       .len = 6,
-       .pattern = scan_agand_pattern
-};
-
 /* Generic flash bbt descriptors */
 static uint8_t bbt_pattern[] = {'B', 'b', 't', '0' };
 static uint8_t mirror_pattern[] = {'1', 't', 'b', 'B' };
@@ -1333,22 +1324,6 @@ int nand_default_bbt(struct mtd_info *mtd)
 {
        struct nand_chip *this = mtd->priv;
 
-       /*
-        * Default for AG-AND. We must use a flash based bad block table as the
-        * devices have factory marked _good_ blocks. Erasing those blocks
-        * leads to loss of the good / bad information, so we _must_ store this
-        * information in a good / bad table during startup.
-        */
-       if (this->options & NAND_IS_AND) {
-               /* Use the default pattern descriptors */
-               if (!this->bbt_td) {
-                       this->bbt_td = &bbt_main_descr;
-                       this->bbt_md = &bbt_mirror_descr;
-               }
-               this->bbt_options |= NAND_BBT_USE_FLASH;
-               return nand_scan_bbt(mtd, &agand_flashbased);
-       }
-
        /* Is a flash based bad block table requested? */
        if (this->bbt_options & NAND_BBT_USE_FLASH) {
                /* Use the default pattern descriptors */
index 9c612388e5deff15def3bc8014fd1d6408222142..683813a46a905f489feabaced468efd88abc136d 100644 (file)
  */
 #include <linux/module.h>
 #include <linux/mtd/nand.h>
-/*
-*      Chip ID list
-*
-*      Name. ID code, pagesize, chipsize in MegaByte, eraseblock size,
-*      options
-*
-*      Pagesize; 0, 256, 512
-*      0       get this information from the extended chip ID
-+      256     256 Byte page size
-*      512     512 Byte page size
-*/
-struct nand_flash_dev nand_flash_ids[] = {
+#include <linux/sizes.h>
+
+#define LP_OPTIONS NAND_SAMSUNG_LP_OPTIONS
+#define LP_OPTIONS16 (LP_OPTIONS | NAND_BUSWIDTH_16)
+
 #define SP_OPTIONS NAND_NEED_READRDY
 #define SP_OPTIONS16 (SP_OPTIONS | NAND_BUSWIDTH_16)
 
-#ifdef CONFIG_MTD_NAND_MUSEUM_IDS
-       {"NAND 1MiB 5V 8-bit",          0x6e, 256, 1, 0x1000, SP_OPTIONS},
-       {"NAND 2MiB 5V 8-bit",          0x64, 256, 2, 0x1000, SP_OPTIONS},
-       {"NAND 4MiB 5V 8-bit",          0x6b, 512, 4, 0x2000, SP_OPTIONS},
-       {"NAND 1MiB 3,3V 8-bit",        0xe8, 256, 1, 0x1000, SP_OPTIONS},
-       {"NAND 1MiB 3,3V 8-bit",        0xec, 256, 1, 0x1000, SP_OPTIONS},
-       {"NAND 2MiB 3,3V 8-bit",        0xea, 256, 2, 0x1000, SP_OPTIONS},
-       {"NAND 4MiB 3,3V 8-bit",        0xd5, 512, 4, 0x2000, SP_OPTIONS},
-       {"NAND 4MiB 3,3V 8-bit",        0xe3, 512, 4, 0x2000, SP_OPTIONS},
-       {"NAND 4MiB 3,3V 8-bit",        0xe5, 512, 4, 0x2000, SP_OPTIONS},
-       {"NAND 8MiB 3,3V 8-bit",        0xd6, 512, 8, 0x2000, SP_OPTIONS},
-
-       {"NAND 8MiB 1,8V 8-bit",        0x39, 512, 8, 0x2000, SP_OPTIONS},
-       {"NAND 8MiB 3,3V 8-bit",        0xe6, 512, 8, 0x2000, SP_OPTIONS},
-       {"NAND 8MiB 1,8V 16-bit",       0x49, 512, 8, 0x2000, SP_OPTIONS16},
-       {"NAND 8MiB 3,3V 16-bit",       0x59, 512, 8, 0x2000, SP_OPTIONS16},
-#endif
-
-       {"NAND 16MiB 1,8V 8-bit",       0x33, 512, 16, 0x4000, SP_OPTIONS},
-       {"NAND 16MiB 3,3V 8-bit",       0x73, 512, 16, 0x4000, SP_OPTIONS},
-       {"NAND 16MiB 1,8V 16-bit",      0x43, 512, 16, 0x4000, SP_OPTIONS16},
-       {"NAND 16MiB 3,3V 16-bit",      0x53, 512, 16, 0x4000, SP_OPTIONS16},
-
-       {"NAND 32MiB 1,8V 8-bit",       0x35, 512, 32, 0x4000, SP_OPTIONS},
-       {"NAND 32MiB 3,3V 8-bit",       0x75, 512, 32, 0x4000, SP_OPTIONS},
-       {"NAND 32MiB 1,8V 16-bit",      0x45, 512, 32, 0x4000, SP_OPTIONS16},
-       {"NAND 32MiB 3,3V 16-bit",      0x55, 512, 32, 0x4000, SP_OPTIONS16},
-
-       {"NAND 64MiB 1,8V 8-bit",       0x36, 512, 64, 0x4000, SP_OPTIONS},
-       {"NAND 64MiB 3,3V 8-bit",       0x76, 512, 64, 0x4000, SP_OPTIONS},
-       {"NAND 64MiB 1,8V 16-bit",      0x46, 512, 64, 0x4000, SP_OPTIONS16},
-       {"NAND 64MiB 3,3V 16-bit",      0x56, 512, 64, 0x4000, SP_OPTIONS16},
-
-       {"NAND 128MiB 1,8V 8-bit",      0x78, 512, 128, 0x4000, SP_OPTIONS},
-       {"NAND 128MiB 1,8V 8-bit",      0x39, 512, 128, 0x4000, SP_OPTIONS},
-       {"NAND 128MiB 3,3V 8-bit",      0x79, 512, 128, 0x4000, SP_OPTIONS},
-       {"NAND 128MiB 1,8V 16-bit",     0x72, 512, 128, 0x4000, SP_OPTIONS16},
-       {"NAND 128MiB 1,8V 16-bit",     0x49, 512, 128, 0x4000, SP_OPTIONS16},
-       {"NAND 128MiB 3,3V 16-bit",     0x74, 512, 128, 0x4000, SP_OPTIONS16},
-       {"NAND 128MiB 3,3V 16-bit",     0x59, 512, 128, 0x4000, SP_OPTIONS16},
-
-       {"NAND 256MiB 3,3V 8-bit",      0x71, 512, 256, 0x4000, SP_OPTIONS},
+/*
+ * The chip ID list:
+ *    name, device ID, page size, chip size in MiB, eraseblock size, options
+ *
+ * If page size and eraseblock size are 0, the sizes are taken from the
+ * extended chip ID.
+ */
+struct nand_flash_dev nand_flash_ids[] = {
+       /*
+        * Some incompatible NAND chips share device ID's and so must be
+        * listed by full ID. We list them first so that we can easily identify
+        * the most specific match.
+        */
+       {"TC58NVG2S0F 4G 3.3V 8-bit",
+               { .id = {0x98, 0xdc, 0x90, 0x26, 0x76, 0x15, 0x01, 0x08} },
+                 SZ_4K, SZ_512, SZ_256K, 0, 8, 224},
+       {"TC58NVG3S0F 8G 3.3V 8-bit",
+               { .id = {0x98, 0xd3, 0x90, 0x26, 0x76, 0x15, 0x02, 0x08} },
+                 SZ_4K, SZ_1K, SZ_256K, 0, 8, 232},
+       {"TC58NVG5D2 32G 3.3V 8-bit",
+               { .id = {0x98, 0xd7, 0x94, 0x32, 0x76, 0x56, 0x09, 0x00} },
+                 SZ_8K, SZ_4K, SZ_1M, 0, 8, 640},
+       {"TC58NVG6D2 64G 3.3V 8-bit",
+               { .id = {0x98, 0xde, 0x94, 0x82, 0x76, 0x56, 0x04, 0x20} },
+                 SZ_8K, SZ_8K, SZ_2M, 0, 8, 640},
+
+       LEGACY_ID_NAND("NAND 4MiB 5V 8-bit",   0x6B, 4, SZ_8K, SP_OPTIONS),
+       LEGACY_ID_NAND("NAND 4MiB 3,3V 8-bit", 0xE3, 4, SZ_8K, SP_OPTIONS),
+       LEGACY_ID_NAND("NAND 4MiB 3,3V 8-bit", 0xE5, 4, SZ_8K, SP_OPTIONS),
+       LEGACY_ID_NAND("NAND 8MiB 3,3V 8-bit", 0xD6, 8, SZ_8K, SP_OPTIONS),
+       LEGACY_ID_NAND("NAND 8MiB 3,3V 8-bit", 0xE6, 8, SZ_8K, SP_OPTIONS),
+
+       LEGACY_ID_NAND("NAND 16MiB 1,8V 8-bit",  0x33, 16, SZ_16K, SP_OPTIONS),
+       LEGACY_ID_NAND("NAND 16MiB 3,3V 8-bit",  0x73, 16, SZ_16K, SP_OPTIONS),
+       LEGACY_ID_NAND("NAND 16MiB 1,8V 16-bit", 0x43, 16, SZ_16K, SP_OPTIONS16),
+       LEGACY_ID_NAND("NAND 16MiB 3,3V 16-bit", 0x53, 16, SZ_16K, SP_OPTIONS16),
+
+       LEGACY_ID_NAND("NAND 32MiB 1,8V 8-bit",  0x35, 32, SZ_16K, SP_OPTIONS),
+       LEGACY_ID_NAND("NAND 32MiB 3,3V 8-bit",  0x75, 32, SZ_16K, SP_OPTIONS),
+       LEGACY_ID_NAND("NAND 32MiB 1,8V 16-bit", 0x45, 32, SZ_16K, SP_OPTIONS16),
+       LEGACY_ID_NAND("NAND 32MiB 3,3V 16-bit", 0x55, 32, SZ_16K, SP_OPTIONS16),
+
+       LEGACY_ID_NAND("NAND 64MiB 1,8V 8-bit",  0x36, 64, SZ_16K, SP_OPTIONS),
+       LEGACY_ID_NAND("NAND 64MiB 3,3V 8-bit",  0x76, 64, SZ_16K, SP_OPTIONS),
+       LEGACY_ID_NAND("NAND 64MiB 1,8V 16-bit", 0x46, 64, SZ_16K, SP_OPTIONS16),
+       LEGACY_ID_NAND("NAND 64MiB 3,3V 16-bit", 0x56, 64, SZ_16K, SP_OPTIONS16),
+
+       LEGACY_ID_NAND("NAND 128MiB 1,8V 8-bit",  0x78, 128, SZ_16K, SP_OPTIONS),
+       LEGACY_ID_NAND("NAND 128MiB 1,8V 8-bit",  0x39, 128, SZ_16K, SP_OPTIONS),
+       LEGACY_ID_NAND("NAND 128MiB 3,3V 8-bit",  0x79, 128, SZ_16K, SP_OPTIONS),
+       LEGACY_ID_NAND("NAND 128MiB 1,8V 16-bit", 0x72, 128, SZ_16K, SP_OPTIONS16),
+       LEGACY_ID_NAND("NAND 128MiB 1,8V 16-bit", 0x49, 128, SZ_16K, SP_OPTIONS16),
+       LEGACY_ID_NAND("NAND 128MiB 3,3V 16-bit", 0x74, 128, SZ_16K, SP_OPTIONS16),
+       LEGACY_ID_NAND("NAND 128MiB 3,3V 16-bit", 0x59, 128, SZ_16K, SP_OPTIONS16),
+
+       LEGACY_ID_NAND("NAND 256MiB 3,3V 8-bit", 0x71, 256, SZ_16K, SP_OPTIONS),
 
        /*
-        * These are the new chips with large page size. The pagesize and the
-        * erasesize is determined from the extended id bytes
+        * These are the new chips with large page size. Their page size and
+        * eraseblock size are determined from the extended ID bytes.
         */
-#define LP_OPTIONS NAND_SAMSUNG_LP_OPTIONS
-#define LP_OPTIONS16 (LP_OPTIONS | NAND_BUSWIDTH_16)
 
        /* 512 Megabit */
-       {"NAND 64MiB 1,8V 8-bit",       0xA2, 0,  64, 0, LP_OPTIONS},
-       {"NAND 64MiB 1,8V 8-bit",       0xA0, 0,  64, 0, LP_OPTIONS},
-       {"NAND 64MiB 3,3V 8-bit",       0xF2, 0,  64, 0, LP_OPTIONS},
-       {"NAND 64MiB 3,3V 8-bit",       0xD0, 0,  64, 0, LP_OPTIONS},
-       {"NAND 64MiB 3,3V 8-bit",       0xF0, 0,  64, 0, LP_OPTIONS},
-       {"NAND 64MiB 1,8V 16-bit",      0xB2, 0,  64, 0, LP_OPTIONS16},
-       {"NAND 64MiB 1,8V 16-bit",      0xB0, 0,  64, 0, LP_OPTIONS16},
-       {"NAND 64MiB 3,3V 16-bit",      0xC2, 0,  64, 0, LP_OPTIONS16},
-       {"NAND 64MiB 3,3V 16-bit",      0xC0, 0,  64, 0, LP_OPTIONS16},
+       EXTENDED_ID_NAND("NAND 64MiB 1,8V 8-bit",  0xA2,  64, LP_OPTIONS),
+       EXTENDED_ID_NAND("NAND 64MiB 1,8V 8-bit",  0xA0,  64, LP_OPTIONS),
+       EXTENDED_ID_NAND("NAND 64MiB 3,3V 8-bit",  0xF2,  64, LP_OPTIONS),
+       EXTENDED_ID_NAND("NAND 64MiB 3,3V 8-bit",  0xD0,  64, LP_OPTIONS),
+       EXTENDED_ID_NAND("NAND 64MiB 3,3V 8-bit",  0xF0,  64, LP_OPTIONS),
+       EXTENDED_ID_NAND("NAND 64MiB 1,8V 16-bit", 0xB2,  64, LP_OPTIONS16),
+       EXTENDED_ID_NAND("NAND 64MiB 1,8V 16-bit", 0xB0,  64, LP_OPTIONS16),
+       EXTENDED_ID_NAND("NAND 64MiB 3,3V 16-bit", 0xC2,  64, LP_OPTIONS16),
+       EXTENDED_ID_NAND("NAND 64MiB 3,3V 16-bit", 0xC0,  64, LP_OPTIONS16),
 
        /* 1 Gigabit */
-       {"NAND 128MiB 1,8V 8-bit",      0xA1, 0, 128, 0, LP_OPTIONS},
-       {"NAND 128MiB 3,3V 8-bit",      0xF1, 0, 128, 0, LP_OPTIONS},
-       {"NAND 128MiB 3,3V 8-bit",      0xD1, 0, 128, 0, LP_OPTIONS},
-       {"NAND 128MiB 1,8V 16-bit",     0xB1, 0, 128, 0, LP_OPTIONS16},
-       {"NAND 128MiB 3,3V 16-bit",     0xC1, 0, 128, 0, LP_OPTIONS16},
-       {"NAND 128MiB 1,8V 16-bit",     0xAD, 0, 128, 0, LP_OPTIONS16},
+       EXTENDED_ID_NAND("NAND 128MiB 1,8V 8-bit",  0xA1, 128, LP_OPTIONS),
+       EXTENDED_ID_NAND("NAND 128MiB 3,3V 8-bit",  0xF1, 128, LP_OPTIONS),
+       EXTENDED_ID_NAND("NAND 128MiB 3,3V 8-bit",  0xD1, 128, LP_OPTIONS),
+       EXTENDED_ID_NAND("NAND 128MiB 1,8V 16-bit", 0xB1, 128, LP_OPTIONS16),
+       EXTENDED_ID_NAND("NAND 128MiB 3,3V 16-bit", 0xC1, 128, LP_OPTIONS16),
+       EXTENDED_ID_NAND("NAND 128MiB 1,8V 16-bit", 0xAD, 128, LP_OPTIONS16),
 
        /* 2 Gigabit */
-       {"NAND 256MiB 1,8V 8-bit",      0xAA, 0, 256, 0, LP_OPTIONS},
-       {"NAND 256MiB 3,3V 8-bit",      0xDA, 0, 256, 0, LP_OPTIONS},
-       {"NAND 256MiB 1,8V 16-bit",     0xBA, 0, 256, 0, LP_OPTIONS16},
-       {"NAND 256MiB 3,3V 16-bit",     0xCA, 0, 256, 0, LP_OPTIONS16},
+       EXTENDED_ID_NAND("NAND 256MiB 1,8V 8-bit",  0xAA, 256, LP_OPTIONS),
+       EXTENDED_ID_NAND("NAND 256MiB 3,3V 8-bit",  0xDA, 256, LP_OPTIONS),
+       EXTENDED_ID_NAND("NAND 256MiB 1,8V 16-bit", 0xBA, 256, LP_OPTIONS16),
+       EXTENDED_ID_NAND("NAND 256MiB 3,3V 16-bit", 0xCA, 256, LP_OPTIONS16),
 
        /* 4 Gigabit */
-       {"NAND 512MiB 1,8V 8-bit",      0xAC, 0, 512, 0, LP_OPTIONS},
-       {"NAND 512MiB 3,3V 8-bit",      0xDC, 0, 512, 0, LP_OPTIONS},
-       {"NAND 512MiB 1,8V 16-bit",     0xBC, 0, 512, 0, LP_OPTIONS16},
-       {"NAND 512MiB 3,3V 16-bit",     0xCC, 0, 512, 0, LP_OPTIONS16},
+       EXTENDED_ID_NAND("NAND 512MiB 1,8V 8-bit",  0xAC, 512, LP_OPTIONS),
+       EXTENDED_ID_NAND("NAND 512MiB 3,3V 8-bit",  0xDC, 512, LP_OPTIONS),
+       EXTENDED_ID_NAND("NAND 512MiB 1,8V 16-bit", 0xBC, 512, LP_OPTIONS16),
+       EXTENDED_ID_NAND("NAND 512MiB 3,3V 16-bit", 0xCC, 512, LP_OPTIONS16),
 
        /* 8 Gigabit */
-       {"NAND 1GiB 1,8V 8-bit",        0xA3, 0, 1024, 0, LP_OPTIONS},
-       {"NAND 1GiB 3,3V 8-bit",        0xD3, 0, 1024, 0, LP_OPTIONS},
-       {"NAND 1GiB 1,8V 16-bit",       0xB3, 0, 1024, 0, LP_OPTIONS16},
-       {"NAND 1GiB 3,3V 16-bit",       0xC3, 0, 1024, 0, LP_OPTIONS16},
+       EXTENDED_ID_NAND("NAND 1GiB 1,8V 8-bit",  0xA3, 1024, LP_OPTIONS),
+       EXTENDED_ID_NAND("NAND 1GiB 3,3V 8-bit",  0xD3, 1024, LP_OPTIONS),
+       EXTENDED_ID_NAND("NAND 1GiB 1,8V 16-bit", 0xB3, 1024, LP_OPTIONS16),
+       EXTENDED_ID_NAND("NAND 1GiB 3,3V 16-bit", 0xC3, 1024, LP_OPTIONS16),
 
        /* 16 Gigabit */
-       {"NAND 2GiB 1,8V 8-bit",        0xA5, 0, 2048, 0, LP_OPTIONS},
-       {"NAND 2GiB 3,3V 8-bit",        0xD5, 0, 2048, 0, LP_OPTIONS},
-       {"NAND 2GiB 1,8V 16-bit",       0xB5, 0, 2048, 0, LP_OPTIONS16},
-       {"NAND 2GiB 3,3V 16-bit",       0xC5, 0, 2048, 0, LP_OPTIONS16},
+       EXTENDED_ID_NAND("NAND 2GiB 1,8V 8-bit",  0xA5, 2048, LP_OPTIONS),
+       EXTENDED_ID_NAND("NAND 2GiB 3,3V 8-bit",  0xD5, 2048, LP_OPTIONS),
+       EXTENDED_ID_NAND("NAND 2GiB 1,8V 16-bit", 0xB5, 2048, LP_OPTIONS16),
+       EXTENDED_ID_NAND("NAND 2GiB 3,3V 16-bit", 0xC5, 2048, LP_OPTIONS16),
 
        /* 32 Gigabit */
-       {"NAND 4GiB 1,8V 8-bit",        0xA7, 0, 4096, 0, LP_OPTIONS},
-       {"NAND 4GiB 3,3V 8-bit",        0xD7, 0, 4096, 0, LP_OPTIONS},
-       {"NAND 4GiB 1,8V 16-bit",       0xB7, 0, 4096, 0, LP_OPTIONS16},
-       {"NAND 4GiB 3,3V 16-bit",       0xC7, 0, 4096, 0, LP_OPTIONS16},
+       EXTENDED_ID_NAND("NAND 4GiB 1,8V 8-bit",  0xA7, 4096, LP_OPTIONS),
+       EXTENDED_ID_NAND("NAND 4GiB 3,3V 8-bit",  0xD7, 4096, LP_OPTIONS),
+       EXTENDED_ID_NAND("NAND 4GiB 1,8V 16-bit", 0xB7, 4096, LP_OPTIONS16),
+       EXTENDED_ID_NAND("NAND 4GiB 3,3V 16-bit", 0xC7, 4096, LP_OPTIONS16),
 
        /* 64 Gigabit */
-       {"NAND 8GiB 1,8V 8-bit",        0xAE, 0, 8192, 0, LP_OPTIONS},
-       {"NAND 8GiB 3,3V 8-bit",        0xDE, 0, 8192, 0, LP_OPTIONS},
-       {"NAND 8GiB 1,8V 16-bit",       0xBE, 0, 8192, 0, LP_OPTIONS16},
-       {"NAND 8GiB 3,3V 16-bit",       0xCE, 0, 8192, 0, LP_OPTIONS16},
+       EXTENDED_ID_NAND("NAND 8GiB 1,8V 8-bit",  0xAE, 8192, LP_OPTIONS),
+       EXTENDED_ID_NAND("NAND 8GiB 3,3V 8-bit",  0xDE, 8192, LP_OPTIONS),
+       EXTENDED_ID_NAND("NAND 8GiB 1,8V 16-bit", 0xBE, 8192, LP_OPTIONS16),
+       EXTENDED_ID_NAND("NAND 8GiB 3,3V 16-bit", 0xCE, 8192, LP_OPTIONS16),
 
        /* 128 Gigabit */
-       {"NAND 16GiB 1,8V 8-bit",       0x1A, 0, 16384, 0, LP_OPTIONS},
-       {"NAND 16GiB 3,3V 8-bit",       0x3A, 0, 16384, 0, LP_OPTIONS},
-       {"NAND 16GiB 1,8V 16-bit",      0x2A, 0, 16384, 0, LP_OPTIONS16},
-       {"NAND 16GiB 3,3V 16-bit",      0x4A, 0, 16384, 0, LP_OPTIONS16},
+       EXTENDED_ID_NAND("NAND 16GiB 1,8V 8-bit",  0x1A, 16384, LP_OPTIONS),
+       EXTENDED_ID_NAND("NAND 16GiB 3,3V 8-bit",  0x3A, 16384, LP_OPTIONS),
+       EXTENDED_ID_NAND("NAND 16GiB 1,8V 16-bit", 0x2A, 16384, LP_OPTIONS16),
+       EXTENDED_ID_NAND("NAND 16GiB 3,3V 16-bit", 0x4A, 16384, LP_OPTIONS16),
 
        /* 256 Gigabit */
-       {"NAND 32GiB 1,8V 8-bit",       0x1C, 0, 32768, 0, LP_OPTIONS},
-       {"NAND 32GiB 3,3V 8-bit",       0x3C, 0, 32768, 0, LP_OPTIONS},
-       {"NAND 32GiB 1,8V 16-bit",      0x2C, 0, 32768, 0, LP_OPTIONS16},
-       {"NAND 32GiB 3,3V 16-bit",      0x4C, 0, 32768, 0, LP_OPTIONS16},
+       EXTENDED_ID_NAND("NAND 32GiB 1,8V 8-bit",  0x1C, 32768, LP_OPTIONS),
+       EXTENDED_ID_NAND("NAND 32GiB 3,3V 8-bit",  0x3C, 32768, LP_OPTIONS),
+       EXTENDED_ID_NAND("NAND 32GiB 1,8V 16-bit", 0x2C, 32768, LP_OPTIONS16),
+       EXTENDED_ID_NAND("NAND 32GiB 3,3V 16-bit", 0x4C, 32768, LP_OPTIONS16),
 
        /* 512 Gigabit */
-       {"NAND 64GiB 1,8V 8-bit",       0x1E, 0, 65536, 0, LP_OPTIONS},
-       {"NAND 64GiB 3,3V 8-bit",       0x3E, 0, 65536, 0, LP_OPTIONS},
-       {"NAND 64GiB 1,8V 16-bit",      0x2E, 0, 65536, 0, LP_OPTIONS16},
-       {"NAND 64GiB 3,3V 16-bit",      0x4E, 0, 65536, 0, LP_OPTIONS16},
+       EXTENDED_ID_NAND("NAND 64GiB 1,8V 8-bit",  0x1E, 65536, LP_OPTIONS),
+       EXTENDED_ID_NAND("NAND 64GiB 3,3V 8-bit",  0x3E, 65536, LP_OPTIONS),
+       EXTENDED_ID_NAND("NAND 64GiB 1,8V 16-bit", 0x2E, 65536, LP_OPTIONS16),
+       EXTENDED_ID_NAND("NAND 64GiB 3,3V 16-bit", 0x4E, 65536, LP_OPTIONS16),
 
-       /*
-        * Renesas AND 1 Gigabit. Those chips do not support extended id and
-        * have a strange page/block layout !  The chosen minimum erasesize is
-        * 4 * 2 * 2048 = 16384 Byte, as those chips have an array of 4 page
-        * planes 1 block = 2 pages, but due to plane arrangement the blocks
-        * 0-3 consists of page 0 + 4,1 + 5, 2 + 6, 3 + 7 Anyway JFFS2 would
-        * increase the eraseblock size so we chose a combined one which can be
-        * erased in one go There are more speed improvements for reads and
-        * writes possible, but not implemented now
-        */
-       {"AND 128MiB 3,3V 8-bit",       0x01, 2048, 128, 0x4000,
-        NAND_IS_AND | NAND_4PAGE_ARRAY | BBT_AUTO_REFRESH},
-
-       {NULL,}
+       {NULL}
 };
 
-/*
-*      Manufacturer ID list
-*/
+/* Manufacturer IDs */
 struct nand_manufacturers nand_manuf_ids[] = {
        {NAND_MFR_TOSHIBA, "Toshiba"},
        {NAND_MFR_SAMSUNG, "Samsung"},
index 891c52a30e6a48468d593926261ea8eee23260d9..cb38f3d94218b2c977437062f8f402d07941fd6c 100644 (file)
@@ -218,7 +218,6 @@ MODULE_PARM_DESC(bch,                "Enable BCH ecc and set how many bits should "
 #define STATE_CMD_READOOB      0x00000005 /* read OOB area */
 #define STATE_CMD_ERASE1       0x00000006 /* sector erase first command */
 #define STATE_CMD_STATUS       0x00000007 /* read status */
-#define STATE_CMD_STATUS_M     0x00000008 /* read multi-plane status (isn't implemented) */
 #define STATE_CMD_SEQIN        0x00000009 /* sequential data input */
 #define STATE_CMD_READID       0x0000000A /* read ID */
 #define STATE_CMD_ERASE2       0x0000000B /* sector erase second command */
@@ -263,14 +262,13 @@ MODULE_PARM_DESC(bch,              "Enable BCH ecc and set how many bits should "
 #define NS_OPER_STATES   6  /* Maximum number of states in operation */
 
 #define OPT_ANY          0xFFFFFFFF /* any chip supports this operation */
-#define OPT_PAGE256      0x00000001 /* 256-byte  page chips */
 #define OPT_PAGE512      0x00000002 /* 512-byte  page chips */
 #define OPT_PAGE2048     0x00000008 /* 2048-byte page chips */
 #define OPT_SMARTMEDIA   0x00000010 /* SmartMedia technology chips */
 #define OPT_PAGE512_8BIT 0x00000040 /* 512-byte page chips with 8-bit bus width */
 #define OPT_PAGE4096     0x00000080 /* 4096-byte page chips */
 #define OPT_LARGEPAGE    (OPT_PAGE2048 | OPT_PAGE4096) /* 2048 & 4096-byte page chips */
-#define OPT_SMALLPAGE    (OPT_PAGE256  | OPT_PAGE512)  /* 256 and 512-byte page chips */
+#define OPT_SMALLPAGE    (OPT_PAGE512) /* 512-byte page chips */
 
 /* Remove action bits from state */
 #define NS_STATE(x) ((x) & ~ACTION_MASK)
@@ -406,8 +404,6 @@ static struct nandsim_operations {
        {OPT_ANY, {STATE_CMD_ERASE1, STATE_ADDR_SEC, STATE_CMD_ERASE2 | ACTION_SECERASE, STATE_READY}},
        /* Read status */
        {OPT_ANY, {STATE_CMD_STATUS, STATE_DATAOUT_STATUS, STATE_READY}},
-       /* Read multi-plane status */
-       {OPT_SMARTMEDIA, {STATE_CMD_STATUS_M, STATE_DATAOUT_STATUS_M, STATE_READY}},
        /* Read ID */
        {OPT_ANY, {STATE_CMD_READID, STATE_ADDR_ZERO, STATE_DATAOUT_ID, STATE_READY}},
        /* Large page devices read page */
@@ -699,10 +695,7 @@ static int init_nandsim(struct mtd_info *mtd)
        ns->geom.secszoob = ns->geom.secsz + ns->geom.oobsz * ns->geom.pgsec;
        ns->options = 0;
 
-       if (ns->geom.pgsz == 256) {
-               ns->options |= OPT_PAGE256;
-       }
-       else if (ns->geom.pgsz == 512) {
+       if (ns->geom.pgsz == 512) {
                ns->options |= OPT_PAGE512;
                if (ns->busw == 8)
                        ns->options |= OPT_PAGE512_8BIT;
@@ -769,9 +762,9 @@ static int init_nandsim(struct mtd_info *mtd)
        }
 
        /* Detect how many ID bytes the NAND chip outputs */
-        for (i = 0; nand_flash_ids[i].name != NULL; i++) {
-                if (second_id_byte != nand_flash_ids[i].id)
-                        continue;
+       for (i = 0; nand_flash_ids[i].name != NULL; i++) {
+               if (second_id_byte != nand_flash_ids[i].dev_id)
+                       continue;
        }
 
        if (ns->busw == 16)
@@ -1079,8 +1072,6 @@ static char *get_state_name(uint32_t state)
                        return "STATE_CMD_ERASE1";
                case STATE_CMD_STATUS:
                        return "STATE_CMD_STATUS";
-               case STATE_CMD_STATUS_M:
-                       return "STATE_CMD_STATUS_M";
                case STATE_CMD_SEQIN:
                        return "STATE_CMD_SEQIN";
                case STATE_CMD_READID:
@@ -1145,7 +1136,6 @@ static int check_command(int cmd)
        case NAND_CMD_RNDOUTSTART:
                return 0;
 
-       case NAND_CMD_STATUS_MULTI:
        default:
                return 1;
        }
@@ -1171,8 +1161,6 @@ static uint32_t get_state_by_command(unsigned command)
                        return STATE_CMD_ERASE1;
                case NAND_CMD_STATUS:
                        return STATE_CMD_STATUS;
-               case NAND_CMD_STATUS_MULTI:
-                       return STATE_CMD_STATUS_M;
                case NAND_CMD_SEQIN:
                        return STATE_CMD_SEQIN;
                case NAND_CMD_READID:
@@ -2306,7 +2294,7 @@ static int __init ns_init_module(void)
                nand->geom.idbytes = 2;
        nand->regs.status = NS_STATUS_OK(nand);
        nand->nxstate = STATE_UNKNOWN;
-       nand->options |= OPT_PAGE256; /* temporary value */
+       nand->options |= OPT_PAGE512; /* temporary value */
        nand->ids[0] = first_id_byte;
        nand->ids[1] = second_id_byte;
        nand->ids[2] = third_id_byte;
index a6191198d259f02b4f1b14e6437eec5d71ea9a30..cd6be2ed53a86a86a1baca65440adb73183d10a8 100644 (file)
@@ -177,15 +177,6 @@ static void nuc900_nand_command_lp(struct mtd_info *mtd, unsigned int command,
        case NAND_CMD_SEQIN:
        case NAND_CMD_RNDIN:
        case NAND_CMD_STATUS:
-       case NAND_CMD_DEPLETE1:
-               return;
-
-       case NAND_CMD_STATUS_ERROR:
-       case NAND_CMD_STATUS_ERROR0:
-       case NAND_CMD_STATUS_ERROR1:
-       case NAND_CMD_STATUS_ERROR2:
-       case NAND_CMD_STATUS_ERROR3:
-               udelay(chip->chip_delay);
                return;
 
        case NAND_CMD_RESET:
index 8e820ddf4e085ba0091c7238085395b40c4b0c39..81b80af55872a4f2481dd6d3d0c61b762e02ff51 100644 (file)
@@ -1023,9 +1023,9 @@ static int omap_wait(struct mtd_info *mtd, struct nand_chip *chip)
        int status, state = this->state;
 
        if (state == FL_ERASING)
-               timeo += (HZ * 400) / 1000;
+               timeo += msecs_to_jiffies(400);
        else
-               timeo += (HZ * 20) / 1000;
+               timeo += msecs_to_jiffies(20);
 
        writeb(NAND_CMD_STATUS & 0xFF, info->reg.gpmc_nand_command);
        while (time_before(jiffies, timeo)) {
@@ -1701,8 +1701,9 @@ static int omap3_init_bch(struct mtd_info *mtd, int ecc_opt)
                elm_node = of_find_node_by_phandle(be32_to_cpup(parp));
                pdev = of_find_device_by_node(elm_node);
                info->elm_dev = &pdev->dev;
-               elm_config(info->elm_dev, bch_type);
-               info->is_elm_used = true;
+
+               if (elm_config(info->elm_dev, bch_type) == 0)
+                       info->is_elm_used = true;
        }
 
        if (info->is_elm_used && (mtd->writesize <= 4096)) {
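Two small notes on the omap2 hunks above. The timeout change is more than cosmetic: (HZ * 20) / 1000 truncates toward zero, while msecs_to_jiffies() never hands back fewer jiffies than the requested interval. As an illustration (HZ = 128 is only an example configuration, not something this patch assumes):

	(HZ * 20) / 1000     = 2560 / 1000 = 2 jiffies   (~15.6 ms, truncated)
	msecs_to_jiffies(20) = 3 jiffies                 (~23.4 ms, rounded up)

It also reads as milliseconds rather than as an HZ formula. The elm_config() change makes is_elm_used reflect reality: the flag is now set only when the error-location module was actually configured, so a failed configuration no longer silently routes BCH correction through a missing ELM.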
index cd72b9299f6b1af0f3491281cbccf1595f0b2a2b..8fbd002086107f1fff96d26307e9b670b1ca7d95 100644 (file)
@@ -231,18 +231,7 @@ static struct platform_driver orion_nand_driver = {
        },
 };
 
-static int __init orion_nand_init(void)
-{
-       return platform_driver_probe(&orion_nand_driver, orion_nand_probe);
-}
-
-static void __exit orion_nand_exit(void)
-{
-       platform_driver_unregister(&orion_nand_driver);
-}
-
-module_init(orion_nand_init);
-module_exit(orion_nand_exit);
+module_platform_driver_probe(orion_nand_driver, orion_nand_probe);
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Tzachi Perelstein");
diff --git a/drivers/mtd/nand/ppchameleonevb.c b/drivers/mtd/nand/ppchameleonevb.c
deleted file mode 100644 (file)
index 0ddd90e..0000000
+++ /dev/null
@@ -1,403 +0,0 @@
-/*
- *  drivers/mtd/nand/ppchameleonevb.c
- *
- *  Copyright (C) 2003 DAVE Srl (info@wawnet.biz)
- *
- *  Derived from drivers/mtd/nand/edb7312.c
- *
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- *  Overview:
- *   This is a device driver for the NAND flash devices found on the
- *   PPChameleon/PPChameleonEVB system.
- *   PPChameleon options (autodetected):
- *   - BA model: no NAND
- *   - ME model: 32MB (Samsung K9F5608U0B)
- *   - HI model: 128MB (Samsung K9F1G08UOM)
- *   PPChameleonEVB options:
- *   - 32MB (Samsung K9F5608U0B)
- */
-
-#include <linux/init.h>
-#include <linux/slab.h>
-#include <linux/module.h>
-#include <linux/mtd/mtd.h>
-#include <linux/mtd/nand.h>
-#include <linux/mtd/partitions.h>
-#include <asm/io.h>
-#include <platforms/PPChameleonEVB.h>
-
-#undef USE_READY_BUSY_PIN
-#define USE_READY_BUSY_PIN
-/* see datasheets (tR) */
-#define NAND_BIG_DELAY_US              25
-#define NAND_SMALL_DELAY_US            10
-
-/* handy sizes */
-#define SZ_4M                           0x00400000
-#define NAND_SMALL_SIZE                 0x02000000
-#define NAND_MTD_NAME          "ppchameleon-nand"
-#define NAND_EVB_MTD_NAME      "ppchameleonevb-nand"
-
-/* GPIO pins used to drive NAND chip mounted on processor module */
-#define NAND_nCE_GPIO_PIN              (0x80000000 >> 1)
-#define NAND_CLE_GPIO_PIN              (0x80000000 >> 2)
-#define NAND_ALE_GPIO_PIN              (0x80000000 >> 3)
-#define NAND_RB_GPIO_PIN               (0x80000000 >> 4)
-/* GPIO pins used to drive NAND chip mounted on EVB */
-#define NAND_EVB_nCE_GPIO_PIN  (0x80000000 >> 14)
-#define NAND_EVB_CLE_GPIO_PIN  (0x80000000 >> 15)
-#define NAND_EVB_ALE_GPIO_PIN  (0x80000000 >> 16)
-#define NAND_EVB_RB_GPIO_PIN   (0x80000000 >> 31)
-
-/*
- * MTD structure for PPChameleonEVB board
- */
-static struct mtd_info *ppchameleon_mtd = NULL;
-static struct mtd_info *ppchameleonevb_mtd = NULL;
-
-/*
- * Module stuff
- */
-static unsigned long ppchameleon_fio_pbase = CFG_NAND0_PADDR;
-static unsigned long ppchameleonevb_fio_pbase = CFG_NAND1_PADDR;
-
-#ifdef MODULE
-module_param(ppchameleon_fio_pbase, ulong, 0);
-module_param(ppchameleonevb_fio_pbase, ulong, 0);
-#else
-__setup("ppchameleon_fio_pbase=", ppchameleon_fio_pbase);
-__setup("ppchameleonevb_fio_pbase=", ppchameleonevb_fio_pbase);
-#endif
-
-/*
- * Define static partitions for flash devices
- */
-static struct mtd_partition partition_info_hi[] = {
-      { .name = "PPChameleon HI Nand Flash",
-       .offset = 0,
-       .size = 128 * 1024 * 1024
-      }
-};
-
-static struct mtd_partition partition_info_me[] = {
-      { .name = "PPChameleon ME Nand Flash",
-       .offset = 0,
-       .size = 32 * 1024 * 1024
-      }
-};
-
-static struct mtd_partition partition_info_evb[] = {
-      { .name = "PPChameleonEVB Nand Flash",
-       .offset = 0,
-       .size = 32 * 1024 * 1024
-      }
-};
-
-#define NUM_PARTITIONS 1
-
-/*
- *     hardware specific access to control-lines
- */
-static void ppchameleon_hwcontrol(struct mtd_info *mtdinfo, int cmd,
-                                 unsigned int ctrl)
-{
-       struct nand_chip *chip = mtd->priv;
-
-       if (ctrl & NAND_CTRL_CHANGE) {
-#error Missing headerfiles. No way to fix this. -tglx
-               switch (cmd) {
-               case NAND_CTL_SETCLE:
-                       MACRO_NAND_CTL_SETCLE((unsigned long)CFG_NAND0_PADDR);
-                       break;
-               case NAND_CTL_CLRCLE:
-                       MACRO_NAND_CTL_CLRCLE((unsigned long)CFG_NAND0_PADDR);
-                       break;
-               case NAND_CTL_SETALE:
-                       MACRO_NAND_CTL_SETALE((unsigned long)CFG_NAND0_PADDR);
-                       break;
-               case NAND_CTL_CLRALE:
-                       MACRO_NAND_CTL_CLRALE((unsigned long)CFG_NAND0_PADDR);
-                       break;
-               case NAND_CTL_SETNCE:
-                       MACRO_NAND_ENABLE_CE((unsigned long)CFG_NAND0_PADDR);
-                       break;
-               case NAND_CTL_CLRNCE:
-                       MACRO_NAND_DISABLE_CE((unsigned long)CFG_NAND0_PADDR);
-                       break;
-               }
-       }
-       if (cmd != NAND_CMD_NONE)
-               writeb(cmd, chip->IO_ADDR_W);
-}
-
-static void ppchameleonevb_hwcontrol(struct mtd_info *mtdinfo, int cmd,
-                                    unsigned int ctrl)
-{
-       struct nand_chip *chip = mtd->priv;
-
-       if (ctrl & NAND_CTRL_CHANGE) {
-#error Missing headerfiles. No way to fix this. -tglx
-               switch (cmd) {
-               case NAND_CTL_SETCLE:
-                       MACRO_NAND_CTL_SETCLE((unsigned long)CFG_NAND1_PADDR);
-                       break;
-               case NAND_CTL_CLRCLE:
-                       MACRO_NAND_CTL_CLRCLE((unsigned long)CFG_NAND1_PADDR);
-                       break;
-               case NAND_CTL_SETALE:
-                       MACRO_NAND_CTL_SETALE((unsigned long)CFG_NAND1_PADDR);
-                       break;
-               case NAND_CTL_CLRALE:
-                       MACRO_NAND_CTL_CLRALE((unsigned long)CFG_NAND1_PADDR);
-                       break;
-               case NAND_CTL_SETNCE:
-                       MACRO_NAND_ENABLE_CE((unsigned long)CFG_NAND1_PADDR);
-                       break;
-               case NAND_CTL_CLRNCE:
-                       MACRO_NAND_DISABLE_CE((unsigned long)CFG_NAND1_PADDR);
-                       break;
-               }
-       }
-       if (cmd != NAND_CMD_NONE)
-               writeb(cmd, chip->IO_ADDR_W);
-}
-
-#ifdef USE_READY_BUSY_PIN
-/*
- *     read device ready pin
- */
-static int ppchameleon_device_ready(struct mtd_info *minfo)
-{
-       if (in_be32((volatile unsigned *)GPIO0_IR) & NAND_RB_GPIO_PIN)
-               return 1;
-       return 0;
-}
-
-static int ppchameleonevb_device_ready(struct mtd_info *minfo)
-{
-       if (in_be32((volatile unsigned *)GPIO0_IR) & NAND_EVB_RB_GPIO_PIN)
-               return 1;
-       return 0;
-}
-#endif
-
-/*
- * Main initialization routine
- */
-static int __init ppchameleonevb_init(void)
-{
-       struct nand_chip *this;
-       void __iomem *ppchameleon_fio_base;
-       void __iomem *ppchameleonevb_fio_base;
-
-       /*********************************
-       * Processor module NAND (if any) *
-       *********************************/
-       /* Allocate memory for MTD device structure and private data */
-       ppchameleon_mtd = kmalloc(sizeof(struct mtd_info) + sizeof(struct nand_chip), GFP_KERNEL);
-       if (!ppchameleon_mtd) {
-               printk("Unable to allocate PPChameleon NAND MTD device structure.\n");
-               return -ENOMEM;
-       }
-
-       /* map physical address */
-       ppchameleon_fio_base = ioremap(ppchameleon_fio_pbase, SZ_4M);
-       if (!ppchameleon_fio_base) {
-               printk("ioremap PPChameleon NAND flash failed\n");
-               kfree(ppchameleon_mtd);
-               return -EIO;
-       }
-
-       /* Get pointer to private data */
-       this = (struct nand_chip *)(&ppchameleon_mtd[1]);
-
-       /* Initialize structures */
-       memset(ppchameleon_mtd, 0, sizeof(struct mtd_info));
-       memset(this, 0, sizeof(struct nand_chip));
-
-       /* Link the private data with the MTD structure */
-       ppchameleon_mtd->priv = this;
-       ppchameleon_mtd->owner = THIS_MODULE;
-
-       /* Initialize GPIOs */
-       /* Pin mapping for NAND chip */
-       /*
-          CE   GPIO_01
-          CLE  GPIO_02
-          ALE  GPIO_03
-          R/B  GPIO_04
-        */
-       /* output select */
-       out_be32((volatile unsigned *)GPIO0_OSRH, in_be32((volatile unsigned *)GPIO0_OSRH) & 0xC0FFFFFF);
-       /* three-state select */
-       out_be32((volatile unsigned *)GPIO0_TSRH, in_be32((volatile unsigned *)GPIO0_TSRH) & 0xC0FFFFFF);
-       /* enable output driver */
-       out_be32((volatile unsigned *)GPIO0_TCR,
-                in_be32((volatile unsigned *)GPIO0_TCR) | NAND_nCE_GPIO_PIN | NAND_CLE_GPIO_PIN | NAND_ALE_GPIO_PIN);
-#ifdef USE_READY_BUSY_PIN
-       /* three-state select */
-       out_be32((volatile unsigned *)GPIO0_TSRH, in_be32((volatile unsigned *)GPIO0_TSRH) & 0xFF3FFFFF);
-       /* high-impedecence */
-       out_be32((volatile unsigned *)GPIO0_TCR, in_be32((volatile unsigned *)GPIO0_TCR) & (~NAND_RB_GPIO_PIN));
-       /* input select */
-       out_be32((volatile unsigned *)GPIO0_ISR1H,
-                (in_be32((volatile unsigned *)GPIO0_ISR1H) & 0xFF3FFFFF) | 0x00400000);
-#endif
-
-       /* insert callbacks */
-       this->IO_ADDR_R = ppchameleon_fio_base;
-       this->IO_ADDR_W = ppchameleon_fio_base;
-       this->cmd_ctrl = ppchameleon_hwcontrol;
-#ifdef USE_READY_BUSY_PIN
-       this->dev_ready = ppchameleon_device_ready;
-#endif
-       this->chip_delay = NAND_BIG_DELAY_US;
-       /* ECC mode */
-       this->ecc.mode = NAND_ECC_SOFT;
-
-       /* Scan to find existence of the device (it could not be mounted) */
-       if (nand_scan(ppchameleon_mtd, 1)) {
-               iounmap((void *)ppchameleon_fio_base);
-               ppchameleon_fio_base = NULL;
-               kfree(ppchameleon_mtd);
-               goto nand_evb_init;
-       }
-#ifndef USE_READY_BUSY_PIN
-       /* Adjust delay if necessary */
-       if (ppchameleon_mtd->size == NAND_SMALL_SIZE)
-               this->chip_delay = NAND_SMALL_DELAY_US;
-#endif
-
-       ppchameleon_mtd->name = "ppchameleon-nand";
-
-       /* Register the partitions */
-       mtd_device_parse_register(ppchameleon_mtd, NULL, NULL,
-                                 ppchameleon_mtd->size == NAND_SMALL_SIZE ?
-                                       partition_info_me : partition_info_hi,
-                                 NUM_PARTITIONS);
-
- nand_evb_init:
-       /****************************
-       * EVB NAND (always present) *
-       ****************************/
-       /* Allocate memory for MTD device structure and private data */
-       ppchameleonevb_mtd = kmalloc(sizeof(struct mtd_info) + sizeof(struct nand_chip), GFP_KERNEL);
-       if (!ppchameleonevb_mtd) {
-               printk("Unable to allocate PPChameleonEVB NAND MTD device structure.\n");
-               if (ppchameleon_fio_base)
-                       iounmap(ppchameleon_fio_base);
-               return -ENOMEM;
-       }
-
-       /* map physical address */
-       ppchameleonevb_fio_base = ioremap(ppchameleonevb_fio_pbase, SZ_4M);
-       if (!ppchameleonevb_fio_base) {
-               printk("ioremap PPChameleonEVB NAND flash failed\n");
-               kfree(ppchameleonevb_mtd);
-               if (ppchameleon_fio_base)
-                       iounmap(ppchameleon_fio_base);
-               return -EIO;
-       }
-
-       /* Get pointer to private data */
-       this = (struct nand_chip *)(&ppchameleonevb_mtd[1]);
-
-       /* Initialize structures */
-       memset(ppchameleonevb_mtd, 0, sizeof(struct mtd_info));
-       memset(this, 0, sizeof(struct nand_chip));
-
-       /* Link the private data with the MTD structure */
-       ppchameleonevb_mtd->priv = this;
-
-       /* Initialize GPIOs */
-       /* Pin mapping for NAND chip */
-       /*
-          CE   GPIO_14
-          CLE  GPIO_15
-          ALE  GPIO_16
-          R/B  GPIO_31
-        */
-       /* output select */
-       out_be32((volatile unsigned *)GPIO0_OSRH, in_be32((volatile unsigned *)GPIO0_OSRH) & 0xFFFFFFF0);
-       out_be32((volatile unsigned *)GPIO0_OSRL, in_be32((volatile unsigned *)GPIO0_OSRL) & 0x3FFFFFFF);
-       /* three-state select */
-       out_be32((volatile unsigned *)GPIO0_TSRH, in_be32((volatile unsigned *)GPIO0_TSRH) & 0xFFFFFFF0);
-       out_be32((volatile unsigned *)GPIO0_TSRL, in_be32((volatile unsigned *)GPIO0_TSRL) & 0x3FFFFFFF);
-       /* enable output driver */
-       out_be32((volatile unsigned *)GPIO0_TCR, in_be32((volatile unsigned *)GPIO0_TCR) | NAND_EVB_nCE_GPIO_PIN |
-                NAND_EVB_CLE_GPIO_PIN | NAND_EVB_ALE_GPIO_PIN);
-#ifdef USE_READY_BUSY_PIN
-       /* three-state select */
-       out_be32((volatile unsigned *)GPIO0_TSRL, in_be32((volatile unsigned *)GPIO0_TSRL) & 0xFFFFFFFC);
-       /* high-impedecence */
-       out_be32((volatile unsigned *)GPIO0_TCR, in_be32((volatile unsigned *)GPIO0_TCR) & (~NAND_EVB_RB_GPIO_PIN));
-       /* input select */
-       out_be32((volatile unsigned *)GPIO0_ISR1L,
-                (in_be32((volatile unsigned *)GPIO0_ISR1L) & 0xFFFFFFFC) | 0x00000001);
-#endif
-
-       /* insert callbacks */
-       this->IO_ADDR_R = ppchameleonevb_fio_base;
-       this->IO_ADDR_W = ppchameleonevb_fio_base;
-       this->cmd_ctrl = ppchameleonevb_hwcontrol;
-#ifdef USE_READY_BUSY_PIN
-       this->dev_ready = ppchameleonevb_device_ready;
-#endif
-       this->chip_delay = NAND_SMALL_DELAY_US;
-
-       /* ECC mode */
-       this->ecc.mode = NAND_ECC_SOFT;
-
-       /* Scan to find existence of the device */
-       if (nand_scan(ppchameleonevb_mtd, 1)) {
-               iounmap((void *)ppchameleonevb_fio_base);
-               kfree(ppchameleonevb_mtd);
-               if (ppchameleon_fio_base)
-                       iounmap(ppchameleon_fio_base);
-               return -ENXIO;
-       }
-
-       ppchameleonevb_mtd->name = NAND_EVB_MTD_NAME;
-
-       /* Register the partitions */
-       mtd_device_parse_register(ppchameleonevb_mtd, NULL, NULL,
-                                 ppchameleon_mtd->size == NAND_SMALL_SIZE ?
-                                 partition_info_me : partition_info_hi,
-                                 NUM_PARTITIONS);
-
-       /* Return happy */
-       return 0;
-}
-
-module_init(ppchameleonevb_init);
-
-/*
- * Clean up routine
- */
-static void __exit ppchameleonevb_cleanup(void)
-{
-       struct nand_chip *this;
-
-       /* Release resources, unregister device(s) */
-       nand_release(ppchameleon_mtd);
-       nand_release(ppchameleonevb_mtd);
-
-       /* Release iomaps */
-       this = (struct nand_chip *) &ppchameleon_mtd[1];
-       iounmap((void *) this->IO_ADDR_R);
-       this = (struct nand_chip *) &ppchameleonevb_mtd[1];
-       iounmap((void *) this->IO_ADDR_R);
-
-       /* Free the MTD device structure */
-       kfree (ppchameleon_mtd);
-       kfree (ppchameleonevb_mtd);
-}
-module_exit(ppchameleonevb_cleanup);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("DAVE Srl <support-ppchameleon@dave-tech.it>");
-MODULE_DESCRIPTION("MTD map driver for DAVE Srl PPChameleonEVB board");
index 37ee75c7bacb9d3200925a56a8caa9490c913dda..dec80ca6a5ce58dbd187690ea42fc706c9dc9325 100644 (file)
@@ -989,7 +989,7 @@ static int pxa3xx_nand_scan(struct mtd_info *mtd)
        }
 
        pxa3xx_flash_ids[0].name = f->name;
-       pxa3xx_flash_ids[0].id = (f->chip_id >> 8) & 0xffff;
+       pxa3xx_flash_ids[0].dev_id = (f->chip_id >> 8) & 0xffff;
        pxa3xx_flash_ids[0].pagesize = f->page_size;
        chipsize = (uint64_t)f->num_blocks * f->page_per_block * f->page_size;
        pxa3xx_flash_ids[0].chipsize = chipsize >> 20;
diff --git a/drivers/mtd/nand/rtc_from4.c b/drivers/mtd/nand/rtc_from4.c
deleted file mode 100644 (file)
index e55b5cf..0000000
+++ /dev/null
@@ -1,624 +0,0 @@
-/*
- *  drivers/mtd/nand/rtc_from4.c
- *
- *  Copyright (C) 2004  Red Hat, Inc.
- *
- *  Derived from drivers/mtd/nand/spia.c
- *       Copyright (C) 2000 Steven J. Hill (sjhill@realitydiluted.com)
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * Overview:
- *   This is a device driver for the AG-AND flash device found on the
- *   Renesas Technology Corp. Flash ROM 4-slot interface board (FROM_BOARD4),
- *   which utilizes the Renesas HN29V1G91T-30 part.
- *   This chip is a 1 GBibit (128MiB x 8 bits) AG-AND flash device.
- */
-
-#include <linux/delay.h>
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/slab.h>
-#include <linux/rslib.h>
-#include <linux/bitrev.h>
-#include <linux/module.h>
-#include <linux/mtd/mtd.h>
-#include <linux/mtd/nand.h>
-#include <linux/mtd/partitions.h>
-#include <asm/io.h>
-
-/*
- * MTD structure for Renesas board
- */
-static struct mtd_info *rtc_from4_mtd = NULL;
-
-#define RTC_FROM4_MAX_CHIPS    2
-
-/* HS77x9 processor register defines */
-#define SH77X9_BCR1    ((volatile unsigned short *)(0xFFFFFF60))
-#define SH77X9_BCR2    ((volatile unsigned short *)(0xFFFFFF62))
-#define SH77X9_WCR1    ((volatile unsigned short *)(0xFFFFFF64))
-#define SH77X9_WCR2    ((volatile unsigned short *)(0xFFFFFF66))
-#define SH77X9_MCR     ((volatile unsigned short *)(0xFFFFFF68))
-#define SH77X9_PCR     ((volatile unsigned short *)(0xFFFFFF6C))
-#define SH77X9_FRQCR   ((volatile unsigned short *)(0xFFFFFF80))
-
-/*
- * Values specific to the Renesas Technology Corp. FROM_BOARD4 (used with HS77x9 processor)
- */
-/* Address where flash is mapped */
-#define RTC_FROM4_FIO_BASE     0x14000000
-
-/* CLE and ALE are tied to address lines 5 & 4, respectively */
-#define RTC_FROM4_CLE          (1 << 5)
-#define RTC_FROM4_ALE          (1 << 4)
-
-/* address lines A24-A22 used for chip selection */
-#define RTC_FROM4_NAND_ADDR_SLOT3      (0x00800000)
-#define RTC_FROM4_NAND_ADDR_SLOT4      (0x00C00000)
-#define RTC_FROM4_NAND_ADDR_FPGA       (0x01000000)
-/* mask address lines A24-A22 used for chip selection */
-#define RTC_FROM4_NAND_ADDR_MASK       (RTC_FROM4_NAND_ADDR_SLOT3 | RTC_FROM4_NAND_ADDR_SLOT4 | RTC_FROM4_NAND_ADDR_FPGA)
-
-/* FPGA status register for checking device ready (bit zero) */
-#define RTC_FROM4_FPGA_SR              (RTC_FROM4_NAND_ADDR_FPGA | 0x00000002)
-#define RTC_FROM4_DEVICE_READY         0x0001
-
-/* FPGA Reed-Solomon ECC Control register */
-
-#define RTC_FROM4_RS_ECC_CTL           (RTC_FROM4_NAND_ADDR_FPGA | 0x00000050)
-#define RTC_FROM4_RS_ECC_CTL_CLR       (1 << 7)
-#define RTC_FROM4_RS_ECC_CTL_GEN       (1 << 6)
-#define RTC_FROM4_RS_ECC_CTL_FD_E      (1 << 5)
-
-/* FPGA Reed-Solomon ECC code base */
-#define RTC_FROM4_RS_ECC               (RTC_FROM4_NAND_ADDR_FPGA | 0x00000060)
-#define RTC_FROM4_RS_ECCN              (RTC_FROM4_NAND_ADDR_FPGA | 0x00000080)
-
-/* FPGA Reed-Solomon ECC check register */
-#define RTC_FROM4_RS_ECC_CHK           (RTC_FROM4_NAND_ADDR_FPGA | 0x00000070)
-#define RTC_FROM4_RS_ECC_CHK_ERROR     (1 << 7)
-
-#define ERR_STAT_ECC_AVAILABLE         0x20
-
-/* Undefine for software ECC */
-#define RTC_FROM4_HWECC        1
-
-/* Define as 1 for no virtual erase blocks (in JFFS2) */
-#define RTC_FROM4_NO_VIRTBLOCKS        0
-
-/*
- * Module stuff
- */
-static void __iomem *rtc_from4_fio_base = (void *)P2SEGADDR(RTC_FROM4_FIO_BASE);
-
-static const struct mtd_partition partition_info[] = {
-       {
-        .name = "Renesas flash partition 1",
-        .offset = 0,
-        .size = MTDPART_SIZ_FULL},
-};
-
-#define NUM_PARTITIONS 1
-
-/*
- *     hardware specific flash bbt decriptors
- *     Note: this is to allow debugging by disabling
- *             NAND_BBT_CREATE and/or NAND_BBT_WRITE
- *
- */
-static uint8_t bbt_pattern[] = { 'B', 'b', 't', '0' };
-static uint8_t mirror_pattern[] = { '1', 't', 'b', 'B' };
-
-static struct nand_bbt_descr rtc_from4_bbt_main_descr = {
-       .options = NAND_BBT_LASTBLOCK | NAND_BBT_CREATE | NAND_BBT_WRITE
-               | NAND_BBT_2BIT | NAND_BBT_VERSION | NAND_BBT_PERCHIP,
-       .offs = 40,
-       .len = 4,
-       .veroffs = 44,
-       .maxblocks = 4,
-       .pattern = bbt_pattern
-};
-
-static struct nand_bbt_descr rtc_from4_bbt_mirror_descr = {
-       .options = NAND_BBT_LASTBLOCK | NAND_BBT_CREATE | NAND_BBT_WRITE
-               | NAND_BBT_2BIT | NAND_BBT_VERSION | NAND_BBT_PERCHIP,
-       .offs = 40,
-       .len = 4,
-       .veroffs = 44,
-       .maxblocks = 4,
-       .pattern = mirror_pattern
-};
-
-#ifdef RTC_FROM4_HWECC
-
-/* the Reed Solomon control structure */
-static struct rs_control *rs_decoder;
-
-/*
- *      hardware specific Out Of Band information
- */
-static struct nand_ecclayout rtc_from4_nand_oobinfo = {
-       .eccbytes = 32,
-       .eccpos = {
-                  0, 1, 2, 3, 4, 5, 6, 7,
-                  8, 9, 10, 11, 12, 13, 14, 15,
-                  16, 17, 18, 19, 20, 21, 22, 23,
-                  24, 25, 26, 27, 28, 29, 30, 31},
-       .oobfree = {{32, 32}}
-};
-
-#endif
-
-/*
- * rtc_from4_hwcontrol - hardware specific access to control-lines
- * @mtd:       MTD device structure
- * @cmd:       hardware control command
- *
- * Address lines (A5 and A4) are used to control Command and Address Latch
- * Enable on this board, so set the read/write address appropriately.
- *
- * Chip Enable is also controlled by the Chip Select (CS5) and
- * Address lines (A24-A22), so no action is required here.
- *
- */
-static void rtc_from4_hwcontrol(struct mtd_info *mtd, int cmd,
-                               unsigned int ctrl)
-{
-       struct nand_chip *chip = (mtd->priv);
-
-       if (cmd == NAND_CMD_NONE)
-               return;
-
-       if (ctrl & NAND_CLE)
-               writeb(cmd, chip->IO_ADDR_W | RTC_FROM4_CLE);
-       else
-               writeb(cmd, chip->IO_ADDR_W | RTC_FROM4_ALE);
-}
-
-/*
- * rtc_from4_nand_select_chip - hardware specific chip select
- * @mtd:       MTD device structure
- * @chip:      Chip to select (0 == slot 3, 1 == slot 4)
- *
- * The chip select is based on address lines A24-A22.
- * This driver uses flash slots 3 and 4 (A23-A22).
- *
- */
-static void rtc_from4_nand_select_chip(struct mtd_info *mtd, int chip)
-{
-       struct nand_chip *this = mtd->priv;
-
-       this->IO_ADDR_R = (void __iomem *)((unsigned long)this->IO_ADDR_R & ~RTC_FROM4_NAND_ADDR_MASK);
-       this->IO_ADDR_W = (void __iomem *)((unsigned long)this->IO_ADDR_W & ~RTC_FROM4_NAND_ADDR_MASK);
-
-       switch (chip) {
-
-       case 0:         /* select slot 3 chip */
-               this->IO_ADDR_R = (void __iomem *)((unsigned long)this->IO_ADDR_R | RTC_FROM4_NAND_ADDR_SLOT3);
-               this->IO_ADDR_W = (void __iomem *)((unsigned long)this->IO_ADDR_W | RTC_FROM4_NAND_ADDR_SLOT3);
-               break;
-       case 1:         /* select slot 4 chip */
-               this->IO_ADDR_R = (void __iomem *)((unsigned long)this->IO_ADDR_R | RTC_FROM4_NAND_ADDR_SLOT4);
-               this->IO_ADDR_W = (void __iomem *)((unsigned long)this->IO_ADDR_W | RTC_FROM4_NAND_ADDR_SLOT4);
-               break;
-
-       }
-}
-
-/*
- * rtc_from4_nand_device_ready - hardware specific ready/busy check
- * @mtd:       MTD device structure
- *
- * This board provides the Ready/Busy state in the status register
- * of the FPGA.  Bit zero indicates the RDY(1)/BSY(0) signal.
- *
- */
-static int rtc_from4_nand_device_ready(struct mtd_info *mtd)
-{
-       unsigned short status;
-
-       status = *((volatile unsigned short *)(rtc_from4_fio_base + RTC_FROM4_FPGA_SR));
-
-       return (status & RTC_FROM4_DEVICE_READY);
-
-}
-
-/*
- * deplete - code to perform device recovery in case there was a power loss
- * @mtd:       MTD device structure
- * @chip:      Chip to select (0 == slot 3, 1 == slot 4)
- *
- * If there was a sudden loss of power during an erase operation, a
- * "device recovery" operation must be performed when power is restored
- * to ensure correct operation.  This routine performs the required steps
- * for the requested chip.
- *
- * See page 86 of the data sheet for details.
- *
- */
-static void deplete(struct mtd_info *mtd, int chip)
-{
-       struct nand_chip *this = mtd->priv;
-
-       /* wait until device is ready */
-       while (!this->dev_ready(mtd)) ;
-
-       this->select_chip(mtd, chip);
-
-       /* Send the commands for device recovery, phase 1 */
-       this->cmdfunc(mtd, NAND_CMD_DEPLETE1, 0x0000, 0x0000);
-       this->cmdfunc(mtd, NAND_CMD_DEPLETE2, -1, -1);
-
-       /* Send the commands for device recovery, phase 2 */
-       this->cmdfunc(mtd, NAND_CMD_DEPLETE1, 0x0000, 0x0004);
-       this->cmdfunc(mtd, NAND_CMD_DEPLETE2, -1, -1);
-
-}
-
-#ifdef RTC_FROM4_HWECC
-/*
- * rtc_from4_enable_hwecc - hardware specific hardware ECC enable function
- * @mtd:       MTD device structure
- * @mode:      I/O mode; read or write
- *
- * enable hardware ECC for data read or write
- *
- */
-static void rtc_from4_enable_hwecc(struct mtd_info *mtd, int mode)
-{
-       volatile unsigned short *rs_ecc_ctl = (volatile unsigned short *)(rtc_from4_fio_base + RTC_FROM4_RS_ECC_CTL);
-       unsigned short status;
-
-       switch (mode) {
-       case NAND_ECC_READ:
-               status = RTC_FROM4_RS_ECC_CTL_CLR | RTC_FROM4_RS_ECC_CTL_FD_E;
-
-               *rs_ecc_ctl = status;
-               break;
-
-       case NAND_ECC_READSYN:
-               status = 0x00;
-
-               *rs_ecc_ctl = status;
-               break;
-
-       case NAND_ECC_WRITE:
-               status = RTC_FROM4_RS_ECC_CTL_CLR | RTC_FROM4_RS_ECC_CTL_GEN | RTC_FROM4_RS_ECC_CTL_FD_E;
-
-               *rs_ecc_ctl = status;
-               break;
-
-       default:
-               BUG();
-               break;
-       }
-
-}
-
-/*
- * rtc_from4_calculate_ecc - hardware specific code to read ECC code
- * @mtd:       MTD device structure
- * @dat:       buffer containing the data to generate ECC codes
- * @ecc_code   ECC codes calculated
- *
- * The ECC code is calculated by the FPGA.  All we have to do is read the values
- * from the FPGA registers.
- *
- * Note: We read from the inverted registers, since data is inverted before
- * the code is calculated. So all 0xff data (blank page) results in all 0xff rs code
- *
- */
-static void rtc_from4_calculate_ecc(struct mtd_info *mtd, const u_char *dat, u_char *ecc_code)
-{
-       volatile unsigned short *rs_eccn = (volatile unsigned short *)(rtc_from4_fio_base + RTC_FROM4_RS_ECCN);
-       unsigned short value;
-       int i;
-
-       for (i = 0; i < 8; i++) {
-               value = *rs_eccn;
-               ecc_code[i] = (unsigned char)value;
-               rs_eccn++;
-       }
-       ecc_code[7] |= 0x0f;    /* set the last four bits (not used) */
-}
-
-/*
- * rtc_from4_correct_data - hardware specific code to correct data using ECC code
- * @mtd:       MTD device structure
- * @buf:       buffer containing the data to generate ECC codes
- * @ecc1       ECC codes read
- * @ecc2       ECC codes calculated
- *
- * The FPGA tells us fast, if there's an error or not. If no, we go back happy
- * else we read the ecc results from the fpga and call the rs library to decode
- * and hopefully correct the error.
- *
- */
-static int rtc_from4_correct_data(struct mtd_info *mtd, const u_char *buf, u_char *ecc1, u_char *ecc2)
-{
-       int i, j, res;
-       unsigned short status;
-       uint16_t par[6], syn[6];
-       uint8_t ecc[8];
-       volatile unsigned short *rs_ecc;
-
-       status = *((volatile unsigned short *)(rtc_from4_fio_base + RTC_FROM4_RS_ECC_CHK));
-
-       if (!(status & RTC_FROM4_RS_ECC_CHK_ERROR)) {
-               return 0;
-       }
-
-       /* Read the syndrome pattern from the FPGA and correct the bitorder */
-       rs_ecc = (volatile unsigned short *)(rtc_from4_fio_base + RTC_FROM4_RS_ECC);
-       for (i = 0; i < 8; i++) {
-               ecc[i] = bitrev8(*rs_ecc);
-               rs_ecc++;
-       }
-
-       /* convert into 6 10bit syndrome fields */
-       par[5] = rs_decoder->index_of[(((uint16_t) ecc[0] >> 0) & 0x0ff) | (((uint16_t) ecc[1] << 8) & 0x300)];
-       par[4] = rs_decoder->index_of[(((uint16_t) ecc[1] >> 2) & 0x03f) | (((uint16_t) ecc[2] << 6) & 0x3c0)];
-       par[3] = rs_decoder->index_of[(((uint16_t) ecc[2] >> 4) & 0x00f) | (((uint16_t) ecc[3] << 4) & 0x3f0)];
-       par[2] = rs_decoder->index_of[(((uint16_t) ecc[3] >> 6) & 0x003) | (((uint16_t) ecc[4] << 2) & 0x3fc)];
-       par[1] = rs_decoder->index_of[(((uint16_t) ecc[5] >> 0) & 0x0ff) | (((uint16_t) ecc[6] << 8) & 0x300)];
-       par[0] = (((uint16_t) ecc[6] >> 2) & 0x03f) | (((uint16_t) ecc[7] << 6) & 0x3c0);
-
-       /* Convert to computable syndrome */
-       for (i = 0; i < 6; i++) {
-               syn[i] = par[0];
-               for (j = 1; j < 6; j++)
-                       if (par[j] != rs_decoder->nn)
-                               syn[i] ^= rs_decoder->alpha_to[rs_modnn(rs_decoder, par[j] + i * j)];
-
-               /* Convert to index form */
-               syn[i] = rs_decoder->index_of[syn[i]];
-       }
-
-       /* Let the library code do its magic. */
-       res = decode_rs8(rs_decoder, (uint8_t *) buf, par, 512, syn, 0, NULL, 0xff, NULL);
-       if (res > 0) {
-               pr_debug("rtc_from4_correct_data: " "ECC corrected %d errors on read\n", res);
-       }
-       return res;
-}
-
-/**
- * rtc_from4_errstat - perform additional error status checks
- * @mtd:       MTD device structure
- * @this:      NAND chip structure
- * @state:     state or the operation
- * @status:    status code returned from read status
- * @page:      startpage inside the chip, must be called with (page & this->pagemask)
- *
- * Perform additional error status checks on erase and write failures
- * to determine if errors are correctable.  For this device, correctable
- * 1-bit errors on erase and write are considered acceptable.
- *
- * note: see pages 34..37 of data sheet for details.
- *
- */
-static int rtc_from4_errstat(struct mtd_info *mtd, struct nand_chip *this,
-                            int state, int status, int page)
-{
-       int er_stat = 0;
-       int rtn, retlen;
-       size_t len;
-       uint8_t *buf;
-       int i;
-
-       this->cmdfunc(mtd, NAND_CMD_STATUS_CLEAR, -1, -1);
-
-       if (state == FL_ERASING) {
-
-               for (i = 0; i < 4; i++) {
-                       if (!(status & 1 << (i + 1)))
-                               continue;
-                       this->cmdfunc(mtd, (NAND_CMD_STATUS_ERROR + i + 1),
-                                     -1, -1);
-                       rtn = this->read_byte(mtd);
-                       this->cmdfunc(mtd, NAND_CMD_STATUS_RESET, -1, -1);
-
-                       /* err_ecc_not_avail */
-                       if (!(rtn & ERR_STAT_ECC_AVAILABLE))
-                               er_stat |= 1 << (i + 1);
-               }
-
-       } else if (state == FL_WRITING) {
-
-               unsigned long corrected = mtd->ecc_stats.corrected;
-
-               /* single bank write logic */
-               this->cmdfunc(mtd, NAND_CMD_STATUS_ERROR, -1, -1);
-               rtn = this->read_byte(mtd);
-               this->cmdfunc(mtd, NAND_CMD_STATUS_RESET, -1, -1);
-
-               if (!(rtn & ERR_STAT_ECC_AVAILABLE)) {
-                       /* err_ecc_not_avail */
-                       er_stat |= 1 << 1;
-                       goto out;
-               }
-
-               len = mtd->writesize;
-               buf = kmalloc(len, GFP_KERNEL);
-               if (!buf) {
-                       er_stat = 1;
-                       goto out;
-               }
-
-               /* recovery read */
-               rtn = nand_do_read(mtd, page, len, &retlen, buf);
-
-               /* if read failed or > 1-bit error corrected */
-               if (rtn || (mtd->ecc_stats.corrected - corrected) > 1)
-                       er_stat |= 1 << 1;
-               kfree(buf);
-       }
-out:
-       rtn = status;
-       if (er_stat == 0) {     /* if ECC is available   */
-               rtn = (status & ~NAND_STATUS_FAIL);     /*   clear the error bit */
-       }
-
-       return rtn;
-}
-#endif
-
-/*
- * Main initialization routine
- */
-static int __init rtc_from4_init(void)
-{
-       struct nand_chip *this;
-       unsigned short bcr1, bcr2, wcr2;
-       int i;
-       int ret;
-
-       /* Allocate memory for MTD device structure and private data */
-       rtc_from4_mtd = kmalloc(sizeof(struct mtd_info) + sizeof(struct nand_chip), GFP_KERNEL);
-       if (!rtc_from4_mtd) {
-               printk("Unable to allocate Renesas NAND MTD device structure.\n");
-               return -ENOMEM;
-       }
-
-       /* Get pointer to private data */
-       this = (struct nand_chip *)(&rtc_from4_mtd[1]);
-
-       /* Initialize structures */
-       memset(rtc_from4_mtd, 0, sizeof(struct mtd_info));
-       memset(this, 0, sizeof(struct nand_chip));
-
-       /* Link the private data with the MTD structure */
-       rtc_from4_mtd->priv = this;
-       rtc_from4_mtd->owner = THIS_MODULE;
-
-       /* set area 5 as PCMCIA mode to clear the spec of tDH(Data hold time;9ns min) */
-       bcr1 = *SH77X9_BCR1 & ~0x0002;
-       bcr1 |= 0x0002;
-       *SH77X9_BCR1 = bcr1;
-
-       /* set */
-       bcr2 = *SH77X9_BCR2 & ~0x0c00;
-       bcr2 |= 0x0800;
-       *SH77X9_BCR2 = bcr2;
-
-       /* set area 5 wait states */
-       wcr2 = *SH77X9_WCR2 & ~0x1c00;
-       wcr2 |= 0x1c00;
-       *SH77X9_WCR2 = wcr2;
-
-       /* Set address of NAND IO lines */
-       this->IO_ADDR_R = rtc_from4_fio_base;
-       this->IO_ADDR_W = rtc_from4_fio_base;
-       /* Set address of hardware control function */
-       this->cmd_ctrl = rtc_from4_hwcontrol;
-       /* Set address of chip select function */
-       this->select_chip = rtc_from4_nand_select_chip;
-       /* command delay time (in us) */
-       this->chip_delay = 100;
-       /* return the status of the Ready/Busy line */
-       this->dev_ready = rtc_from4_nand_device_ready;
-
-#ifdef RTC_FROM4_HWECC
-       printk(KERN_INFO "rtc_from4_init: using hardware ECC detection.\n");
-
-       this->ecc.mode = NAND_ECC_HW_SYNDROME;
-       this->ecc.size = 512;
-       this->ecc.bytes = 8;
-       this->ecc.strength = 3;
-       /* return the status of extra status and ECC checks */
-       this->errstat = rtc_from4_errstat;
-       /* set the nand_oobinfo to support FPGA H/W error detection */
-       this->ecc.layout = &rtc_from4_nand_oobinfo;
-       this->ecc.hwctl = rtc_from4_enable_hwecc;
-       this->ecc.calculate = rtc_from4_calculate_ecc;
-       this->ecc.correct = rtc_from4_correct_data;
-
-       /* We could create the decoder on demand, if memory is a concern.
-        * This way we have it handy, if an error happens
-        *
-        * Symbolsize is 10 (bits)
-        * Primitve polynomial is x^10+x^3+1
-        * first consecutive root is 0
-        * primitve element to generate roots = 1
-        * generator polinomial degree = 6
-        */
-       rs_decoder = init_rs(10, 0x409, 0, 1, 6);
-       if (!rs_decoder) {
-               printk(KERN_ERR "Could not create a RS decoder\n");
-               ret = -ENOMEM;
-               goto err_1;
-       }
-#else
-       printk(KERN_INFO "rtc_from4_init: using software ECC detection.\n");
-
-       this->ecc.mode = NAND_ECC_SOFT;
-#endif
-
-       /* set the bad block tables to support debugging */
-       this->bbt_td = &rtc_from4_bbt_main_descr;
-       this->bbt_md = &rtc_from4_bbt_mirror_descr;
-
-       /* Scan to find existence of the device */
-       if (nand_scan(rtc_from4_mtd, RTC_FROM4_MAX_CHIPS)) {
-               ret = -ENXIO;
-               goto err_2;
-       }
-
-       /* Perform 'device recovery' for each chip in case there was a power loss. */
-       for (i = 0; i < this->numchips; i++) {
-               deplete(rtc_from4_mtd, i);
-       }
-
-#if RTC_FROM4_NO_VIRTBLOCKS
-       /* use a smaller erase block to minimize wasted space when a block is bad */
-       /* note: this uses eight times as much RAM as using the default and makes */
-       /*       mounts take four times as long. */
-       rtc_from4_mtd->flags |= MTD_NO_VIRTBLOCKS;
-#endif
-
-       /* Register the partitions */
-       ret = mtd_device_register(rtc_from4_mtd, partition_info,
-                                 NUM_PARTITIONS);
-       if (ret)
-               goto err_3;
-
-       /* Return happy */
-       return 0;
-err_3:
-       nand_release(rtc_from4_mtd);
-err_2:
-       free_rs(rs_decoder);
-err_1:
-       kfree(rtc_from4_mtd);
-       return ret;
-}
-
-module_init(rtc_from4_init);
-
-/*
- * Clean up routine
- */
-static void __exit rtc_from4_cleanup(void)
-{
-       /* Release resource, unregister partitions */
-       nand_release(rtc_from4_mtd);
-
-       /* Free the MTD device structure */
-       kfree(rtc_from4_mtd);
-
-#ifdef RTC_FROM4_HWECC
-       /* Free the reed solomon resources */
-       if (rs_decoder) {
-               free_rs(rs_decoder);
-       }
-#endif
-}
-
-module_exit(rtc_from4_cleanup);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("d.marlin <dmarlin@redhat.com");
-MODULE_DESCRIPTION("Board-specific glue layer for AG-AND flash on Renesas FROM_BOARD4");
index 57b3971c9c0a609a89284bef3e32b977466f6662..e57e18e8c2893ab8077693e13385bb06ddaad9af 100644 (file)
@@ -1081,7 +1081,6 @@ static struct sh_flctl_platform_data *flctl_parse_dt(struct device *dev)
        return pdata;
 }
 #else /* CONFIG_OF */
-#define of_flctl_match NULL
 static struct sh_flctl_platform_data *flctl_parse_dt(struct device *dev)
 {
        return NULL;
@@ -1219,22 +1218,11 @@ static struct platform_driver flctl_driver = {
        .driver = {
                .name   = "sh_flctl",
                .owner  = THIS_MODULE,
-               .of_match_table = of_flctl_match,
+               .of_match_table = of_match_ptr(of_flctl_match),
        },
 };
 
-static int __init flctl_nand_init(void)
-{
-       return platform_driver_probe(&flctl_driver, flctl_probe);
-}
-
-static void __exit flctl_nand_cleanup(void)
-{
-       platform_driver_unregister(&flctl_driver);
-}
-
-module_init(flctl_nand_init);
-module_exit(flctl_nand_cleanup);
+module_platform_driver_probe(flctl_driver, flctl_probe);
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Yoshihiro Shimoda");
index 082bcdcd6bcfa3f460e26b68edc183b56aba5947..e8181edebddd10923904c4db4be17adf79d29316 100644 (file)
@@ -9,6 +9,7 @@
 #include <linux/kernel.h>
 #include <linux/mtd/nand.h>
 #include <linux/module.h>
+#include <linux/sizes.h>
 #include "sm_common.h"
 
 static struct nand_ecclayout nand_oob_sm = {
@@ -67,44 +68,37 @@ static int sm_block_markbad(struct mtd_info *mtd, loff_t ofs)
        return error;
 }
 
-
 static struct nand_flash_dev nand_smartmedia_flash_ids[] = {
-       {"SmartMedia 1MiB 5V",          0x6e, 256, 1, 0x1000, 0},
-       {"SmartMedia 1MiB 3,3V",        0xe8, 256, 1, 0x1000, 0},
-       {"SmartMedia 1MiB 3,3V",        0xec, 256, 1, 0x1000, 0},
-       {"SmartMedia 2MiB 3,3V",        0xea, 256, 2, 0x1000, 0},
-       {"SmartMedia 2MiB 5V",          0x64, 256, 2, 0x1000, 0},
-       {"SmartMedia 2MiB 3,3V ROM",    0x5d, 512, 2, 0x2000, NAND_ROM},
-       {"SmartMedia 4MiB 3,3V",        0xe3, 512, 4, 0x2000, 0},
-       {"SmartMedia 4MiB 3,3/5V",      0xe5, 512, 4, 0x2000, 0},
-       {"SmartMedia 4MiB 5V",          0x6b, 512, 4, 0x2000, 0},
-       {"SmartMedia 4MiB 3,3V ROM",    0xd5, 512, 4, 0x2000, NAND_ROM},
-       {"SmartMedia 8MiB 3,3V",        0xe6, 512, 8, 0x2000, 0},
-       {"SmartMedia 8MiB 3,3V ROM",    0xd6, 512, 8, 0x2000, NAND_ROM},
-       {"SmartMedia 16MiB 3,3V",       0x73, 512, 16, 0x4000, 0},
-       {"SmartMedia 16MiB 3,3V ROM",   0x57, 512, 16, 0x4000, NAND_ROM},
-       {"SmartMedia 32MiB 3,3V",       0x75, 512, 32, 0x4000, 0},
-       {"SmartMedia 32MiB 3,3V ROM",   0x58, 512, 32, 0x4000, NAND_ROM},
-       {"SmartMedia 64MiB 3,3V",       0x76, 512, 64, 0x4000, 0},
-       {"SmartMedia 64MiB 3,3V ROM",   0xd9, 512, 64, 0x4000, NAND_ROM},
-       {"SmartMedia 128MiB 3,3V",      0x79, 512, 128, 0x4000, 0},
-       {"SmartMedia 128MiB 3,3V ROM",  0xda, 512, 128, 0x4000, NAND_ROM},
-       {"SmartMedia 256MiB 3,3V",      0x71, 512, 256, 0x4000 },
-       {"SmartMedia 256MiB 3,3V ROM",  0x5b, 512, 256, 0x4000, NAND_ROM},
-       {NULL,}
+       LEGACY_ID_NAND("SmartMedia 2MiB 3,3V ROM",   0x5d, 2,   SZ_8K, NAND_ROM),
+       LEGACY_ID_NAND("SmartMedia 4MiB 3,3V",       0xe3, 4,   SZ_8K, 0),
+       LEGACY_ID_NAND("SmartMedia 4MiB 3,3/5V",     0xe5, 4,   SZ_8K, 0),
+       LEGACY_ID_NAND("SmartMedia 4MiB 5V",         0x6b, 4,   SZ_8K, 0),
+       LEGACY_ID_NAND("SmartMedia 4MiB 3,3V ROM",   0xd5, 4,   SZ_8K, NAND_ROM),
+       LEGACY_ID_NAND("SmartMedia 8MiB 3,3V",       0xe6, 8,   SZ_8K, 0),
+       LEGACY_ID_NAND("SmartMedia 8MiB 3,3V ROM",   0xd6, 8,   SZ_8K, NAND_ROM),
+       LEGACY_ID_NAND("SmartMedia 16MiB 3,3V",      0x73, 16,  SZ_16K, 0),
+       LEGACY_ID_NAND("SmartMedia 16MiB 3,3V ROM",  0x57, 16,  SZ_16K, NAND_ROM),
+       LEGACY_ID_NAND("SmartMedia 32MiB 3,3V",      0x75, 32,  SZ_16K, 0),
+       LEGACY_ID_NAND("SmartMedia 32MiB 3,3V ROM",  0x58, 32,  SZ_16K, NAND_ROM),
+       LEGACY_ID_NAND("SmartMedia 64MiB 3,3V",      0x76, 64,  SZ_16K, 0),
+       LEGACY_ID_NAND("SmartMedia 64MiB 3,3V ROM",  0xd9, 64,  SZ_16K, NAND_ROM),
+       LEGACY_ID_NAND("SmartMedia 128MiB 3,3V",     0x79, 128, SZ_16K, 0),
+       LEGACY_ID_NAND("SmartMedia 128MiB 3,3V ROM", 0xda, 128, SZ_16K, NAND_ROM),
+       LEGACY_ID_NAND("SmartMedia 256MiB 3,3V",     0x71, 256, SZ_16K, 0),
+       LEGACY_ID_NAND("SmartMedia 256MiB 3,3V ROM", 0x5b, 256, SZ_16K, NAND_ROM),
+       {NULL}
 };
 
 static struct nand_flash_dev nand_xd_flash_ids[] = {
-
-       {"xD 16MiB 3,3V",    0x73, 512, 16, 0x4000, 0},
-       {"xD 32MiB 3,3V",    0x75, 512, 32, 0x4000, 0},
-       {"xD 64MiB 3,3V",    0x76, 512, 64, 0x4000, 0},
-       {"xD 128MiB 3,3V",   0x79, 512, 128, 0x4000, 0},
-       {"xD 256MiB 3,3V",   0x71, 512, 256, 0x4000, NAND_BROKEN_XD},
-       {"xD 512MiB 3,3V",   0xdc, 512, 512, 0x4000, NAND_BROKEN_XD},
-       {"xD 1GiB 3,3V",     0xd3, 512, 1024, 0x4000, NAND_BROKEN_XD},
-       {"xD 2GiB 3,3V",     0xd5, 512, 2048, 0x4000, NAND_BROKEN_XD},
-       {NULL,}
+       LEGACY_ID_NAND("xD 16MiB 3,3V",  0x73, 16,   SZ_16K, 0),
+       LEGACY_ID_NAND("xD 32MiB 3,3V",  0x75, 32,   SZ_16K, 0),
+       LEGACY_ID_NAND("xD 64MiB 3,3V",  0x76, 64,   SZ_16K, 0),
+       LEGACY_ID_NAND("xD 128MiB 3,3V", 0x79, 128,  SZ_16K, 0),
+       LEGACY_ID_NAND("xD 256MiB 3,3V", 0x71, 256,  SZ_16K, NAND_BROKEN_XD),
+       LEGACY_ID_NAND("xD 512MiB 3,3V", 0xdc, 512,  SZ_16K, NAND_BROKEN_XD),
+       LEGACY_ID_NAND("xD 1GiB 3,3V",   0xd3, 1024, SZ_16K, NAND_BROKEN_XD),
+       LEGACY_ID_NAND("xD 2GiB 3,3V",   0xd5, 2048, SZ_16K, NAND_BROKEN_XD),
+       {NULL}
 };
 
 int sm_register_device(struct mtd_info *mtd, int smartmedia)
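The table conversion above is mechanical: each entry keeps its device ID and capacity (now written directly in MiB), and the erase sizes move from raw hex literals to the SZ_* constants pulled in by the new <linux/sizes.h> include; LEGACY_ID_NAND is assumed to supply the fixed 512-byte page size the old initializers listed explicitly. A compile-time sketch (not part of the patch) confirming the symbolic sizes equal the literals they replace:

#include <linux/bug.h>
#include <linux/sizes.h>

/* sketch only: SZ_8K and SZ_16K match the old 0x2000 / 0x4000 erase sizes */
static inline void sm_size_constants_check(void)
{
	BUILD_BUG_ON(SZ_8K  != 0x2000);
	BUILD_BUG_ON(SZ_16K != 0x4000);
}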
index e1e8748aa47b263c2f4349b49e5e0f715236d0f5..7ed654c68b0867af79c82da6f65cc210fb827498 100644 (file)
@@ -427,18 +427,7 @@ static struct platform_driver txx9ndfmc_driver = {
        },
 };
 
-static int __init txx9ndfmc_init(void)
-{
-       return platform_driver_probe(&txx9ndfmc_driver, txx9ndfmc_probe);
-}
-
-static void __exit txx9ndfmc_exit(void)
-{
-       platform_driver_unregister(&txx9ndfmc_driver);
-}
-
-module_init(txx9ndfmc_init);
-module_exit(txx9ndfmc_exit);
+module_platform_driver_probe(txx9ndfmc_driver, txx9ndfmc_probe);
 
 MODULE_LICENSE("GPL");
 MODULE_DESCRIPTION("TXx9 SoC NAND flash controller driver");
index 30bd907a260afe19420fd84b0992018faee82e3a..553d6d6d560322c4bcafa44a184e6c09e864b621 100644 (file)
@@ -55,6 +55,7 @@ static int parse_ofpart_partitions(struct mtd_info *master,
        while ((pp = of_get_next_child(node, pp))) {
                const __be32 *reg;
                int len;
+               int a_cells, s_cells;
 
                reg = of_get_property(pp, "reg", &len);
                if (!reg) {
@@ -62,8 +63,10 @@ static int parse_ofpart_partitions(struct mtd_info *master,
                        continue;
                }
 
-               (*pparts)[i].offset = be32_to_cpu(reg[0]);
-               (*pparts)[i].size = be32_to_cpu(reg[1]);
+               a_cells = of_n_addr_cells(pp);
+               s_cells = of_n_size_cells(pp);
+               (*pparts)[i].offset = of_read_number(reg, a_cells);
+               (*pparts)[i].size = of_read_number(reg + a_cells, s_cells);
 
                partname = of_get_property(pp, "label", &len);
                if (!partname)
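The hunk above stops assuming a partition's "reg" property is a pair of single 32-bit cells and instead reads it according to the node's #address-cells and #size-cells. A hedged sketch of the same read, with an assumed two-cell layout to show why the old be32_to_cpu(reg[0]) / be32_to_cpu(reg[1]) accesses would pick up the wrong cells on such a node:

#include <linux/of.h>
#include <linux/types.h>

/*
 * Sketch only.  With #address-cells = <2> and #size-cells = <2>, a node like
 *
 *	partition@0 {
 *		label = "example";			(hypothetical)
 *		reg = <0x0 0x100000  0x0 0x400000>;
 *	};
 *
 * stores a two-cell offset and a two-cell size, so each value has to be
 * assembled from the correct number of cells.
 */
static void read_partition_reg(struct device_node *pp, const __be32 *reg,
			       u64 *offset, u64 *size)
{
	int a_cells = of_n_addr_cells(pp);	/* cells per address */
	int s_cells = of_n_size_cells(pp);	/* cells per size */

	*offset = of_read_number(reg, a_cells);
	*size   = of_read_number(reg + a_cells, s_cells);
}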
index 91467bb036341fe52f84a5db00c7664365d0abb9..ab2607273e809e32f62fde7654e794a5a5b252bb 100644 (file)
@@ -40,7 +40,6 @@ config MTD_ONENAND_SAMSUNG
 
 config MTD_ONENAND_OTP
        bool "OneNAND OTP Support"
-       select HAVE_MTD_OTP
        help
          One Block of the NAND Flash Array memory is reserved as
          a One-Time Programmable Block memory area.
@@ -68,10 +67,4 @@ config MTD_ONENAND_2X_PROGRAM
 
          And more recent chips
 
-config MTD_ONENAND_SIM
-       tristate "OneNAND simulator support"
-       help
-         The simulator may simulate various OneNAND flash chips for the
-         OneNAND MTD layer.
-
 endif # MTD_ONENAND
index 2b7884c7577e7cda2e789da09ea3514969658fc5..9d6540e8b3d22ae0fe3d04422c62cdbf2f2b54d1 100644 (file)
@@ -10,7 +10,4 @@ obj-$(CONFIG_MTD_ONENAND_GENERIC)     += generic.o
 obj-$(CONFIG_MTD_ONENAND_OMAP2)                += omap2.o
 obj-$(CONFIG_MTD_ONENAND_SAMSUNG)       += samsung.o
 
-# Simulator
-obj-$(CONFIG_MTD_ONENAND_SIM)          += onenand_sim.o
-
 onenand-objs = onenand_base.o onenand_bbt.o
index eec2aedb4ab83767587cd4f38f28d8632632478a..d98b198edd53a27139c1eaa0ffad5ae5f0fac9f2 100644 (file)
@@ -832,19 +832,7 @@ static struct platform_driver omap2_onenand_driver = {
        },
 };
 
-static int __init omap2_onenand_init(void)
-{
-       printk(KERN_INFO "OneNAND driver initializing\n");
-       return platform_driver_register(&omap2_onenand_driver);
-}
-
-static void __exit omap2_onenand_exit(void)
-{
-       platform_driver_unregister(&omap2_onenand_driver);
-}
-
-module_init(omap2_onenand_init);
-module_exit(omap2_onenand_exit);
+module_platform_driver(omap2_onenand_driver);
 
 MODULE_ALIAS("platform:" DRIVER_NAME);
 MODULE_LICENSE("GPL");
diff --git a/drivers/mtd/onenand/onenand_sim.c b/drivers/mtd/onenand/onenand_sim.c
deleted file mode 100644 (file)
index 85399e3..0000000
+++ /dev/null
@@ -1,564 +0,0 @@
-/*
- *  linux/drivers/mtd/onenand/onenand_sim.c
- *
- *  The OneNAND simulator
- *
- *  Copyright © 2005-2007 Samsung Electronics
- *  Kyungmin Park <kyungmin.park@samsung.com>
- *
- *  Vishak G <vishak.g at samsung.com>, Rohit Hagargundgi <h.rohit at samsung.com>
- *  Flex-OneNAND simulator support
- *  Copyright (C) Samsung Electronics, 2008
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/kernel.h>
-#include <linux/slab.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/vmalloc.h>
-#include <linux/mtd/mtd.h>
-#include <linux/mtd/partitions.h>
-#include <linux/mtd/onenand.h>
-
-#include <linux/io.h>
-
-#ifndef CONFIG_ONENAND_SIM_MANUFACTURER
-#define CONFIG_ONENAND_SIM_MANUFACTURER         0xec
-#endif
-
-#ifndef CONFIG_ONENAND_SIM_DEVICE_ID
-#define CONFIG_ONENAND_SIM_DEVICE_ID            0x04
-#endif
-
-#define CONFIG_FLEXONENAND ((CONFIG_ONENAND_SIM_DEVICE_ID >> 9) & 1)
-
-#ifndef CONFIG_ONENAND_SIM_VERSION_ID
-#define CONFIG_ONENAND_SIM_VERSION_ID           0x1e
-#endif
-
-#ifndef CONFIG_ONENAND_SIM_TECHNOLOGY_ID
-#define CONFIG_ONENAND_SIM_TECHNOLOGY_ID CONFIG_FLEXONENAND
-#endif
-
-/* Initial boundary values for Flex-OneNAND Simulator */
-#ifndef CONFIG_FLEXONENAND_SIM_DIE0_BOUNDARY
-#define CONFIG_FLEXONENAND_SIM_DIE0_BOUNDARY   0x01
-#endif
-
-#ifndef CONFIG_FLEXONENAND_SIM_DIE1_BOUNDARY
-#define CONFIG_FLEXONENAND_SIM_DIE1_BOUNDARY   0x01
-#endif
-
-static int manuf_id    = CONFIG_ONENAND_SIM_MANUFACTURER;
-static int device_id   = CONFIG_ONENAND_SIM_DEVICE_ID;
-static int version_id  = CONFIG_ONENAND_SIM_VERSION_ID;
-static int technology_id = CONFIG_ONENAND_SIM_TECHNOLOGY_ID;
-static int boundary[] = {
-       CONFIG_FLEXONENAND_SIM_DIE0_BOUNDARY,
-       CONFIG_FLEXONENAND_SIM_DIE1_BOUNDARY,
-};
-
-struct onenand_flash {
-       void __iomem *base;
-       void __iomem *data;
-};
-
-#define ONENAND_CORE(flash)            (flash->data)
-#define ONENAND_CORE_SPARE(flash, this, offset)                                \
-       ((flash->data) + (this->chipsize) + (offset >> 5))
-
-#define ONENAND_MAIN_AREA(this, offset)                                        \
-       (this->base + ONENAND_DATARAM + offset)
-
-#define ONENAND_SPARE_AREA(this, offset)                               \
-       (this->base + ONENAND_SPARERAM + offset)
-
-#define ONENAND_GET_WP_STATUS(this)                                    \
-       (readw(this->base + ONENAND_REG_WP_STATUS))
-
-#define ONENAND_SET_WP_STATUS(v, this)                                 \
-       (writew(v, this->base + ONENAND_REG_WP_STATUS))
-
-/* It has all 0xff chars */
-#define MAX_ONENAND_PAGESIZE           (4096 + 128)
-static unsigned char *ffchars;
-
-#if CONFIG_FLEXONENAND
-#define PARTITION_NAME "Flex-OneNAND simulator partition"
-#else
-#define PARTITION_NAME "OneNAND simulator partition"
-#endif
-
-static struct mtd_partition os_partitions[] = {
-       {
-               .name           = PARTITION_NAME,
-               .offset         = 0,
-               .size           = MTDPART_SIZ_FULL,
-       },
-};
-
-/*
- * OneNAND simulator mtd
- */
-struct onenand_info {
-       struct mtd_info         mtd;
-       struct mtd_partition    *parts;
-       struct onenand_chip     onenand;
-       struct onenand_flash    flash;
-};
-
-static struct onenand_info *info;
-
-#define DPRINTK(format, args...)                                       \
-do {                                                                   \
-       printk(KERN_DEBUG "%s[%d]: " format "\n", __func__,             \
-                          __LINE__, ##args);                           \
-} while (0)
-
-/**
- * onenand_lock_handle - Handle Lock scheme
- * @this:              OneNAND device structure
- * @cmd:               The command to be sent
- *
- * Send lock command to OneNAND device.
- * The lock scheme depends on chip type.
- */
-static void onenand_lock_handle(struct onenand_chip *this, int cmd)
-{
-       int block_lock_scheme;
-       int status;
-
-       status = ONENAND_GET_WP_STATUS(this);
-       block_lock_scheme = !(this->options & ONENAND_HAS_CONT_LOCK);
-
-       switch (cmd) {
-       case ONENAND_CMD_UNLOCK:
-       case ONENAND_CMD_UNLOCK_ALL:
-               if (block_lock_scheme)
-                       ONENAND_SET_WP_STATUS(ONENAND_WP_US, this);
-               else
-                       ONENAND_SET_WP_STATUS(status | ONENAND_WP_US, this);
-               break;
-
-       case ONENAND_CMD_LOCK:
-               if (block_lock_scheme)
-                       ONENAND_SET_WP_STATUS(ONENAND_WP_LS, this);
-               else
-                       ONENAND_SET_WP_STATUS(status | ONENAND_WP_LS, this);
-               break;
-
-       case ONENAND_CMD_LOCK_TIGHT:
-               if (block_lock_scheme)
-                       ONENAND_SET_WP_STATUS(ONENAND_WP_LTS, this);
-               else
-                       ONENAND_SET_WP_STATUS(status | ONENAND_WP_LTS, this);
-               break;
-
-       default:
-               break;
-       }
-}
-
-/**
- * onenand_bootram_handle - Handle BootRAM area
- * @this:              OneNAND device structure
- * @cmd:               The command to be sent
- *
- * Emulate BootRAM area. It is possible to do basic operations using BootRAM.
- */
-static void onenand_bootram_handle(struct onenand_chip *this, int cmd)
-{
-       switch (cmd) {
-       case ONENAND_CMD_READID:
-               writew(manuf_id, this->base);
-               writew(device_id, this->base + 2);
-               writew(version_id, this->base + 4);
-               break;
-
-       default:
-               /* REVISIT: Handle other commands */
-               break;
-       }
-}
-
-/**
- * onenand_update_interrupt - Set interrupt register
- * @this:         OneNAND device structure
- * @cmd:          The command to be sent
- *
- * Update interrupt register. The status depends on command.
- */
-static void onenand_update_interrupt(struct onenand_chip *this, int cmd)
-{
-       int interrupt = ONENAND_INT_MASTER;
-
-       switch (cmd) {
-       case ONENAND_CMD_READ:
-       case ONENAND_CMD_READOOB:
-               interrupt |= ONENAND_INT_READ;
-               break;
-
-       case ONENAND_CMD_PROG:
-       case ONENAND_CMD_PROGOOB:
-               interrupt |= ONENAND_INT_WRITE;
-               break;
-
-       case ONENAND_CMD_ERASE:
-               interrupt |= ONENAND_INT_ERASE;
-               break;
-
-       case ONENAND_CMD_RESET:
-               interrupt |= ONENAND_INT_RESET;
-               break;
-
-       default:
-               break;
-       }
-
-       writew(interrupt, this->base + ONENAND_REG_INTERRUPT);
-}
-
-/**
- * onenand_check_overwrite - Check if over-write happened
- * @dest:              The destination pointer
- * @src:               The source pointer
- * @count:             The length to be checked
- *
- * Returns:            0 on same, otherwise 1
- *
- * Compare the source with destination
- */
-static int onenand_check_overwrite(void *dest, void *src, size_t count)
-{
-       unsigned int *s = (unsigned int *) src;
-       unsigned int *d = (unsigned int *) dest;
-       int i;
-
-       count >>= 2;
-       for (i = 0; i < count; i++)
-               if ((*s++ ^ *d++) != 0)
-                       return 1;
-
-       return 0;
-}
-
-/**
- * onenand_data_handle - Handle OneNAND Core and DataRAM
- * @this:              OneNAND device structure
- * @cmd:               The command to be sent
- * @dataram:           Which dataram used
- * @offset:            The offset to OneNAND Core
- *
- * Copy data from OneNAND Core to DataRAM (read)
- * Copy data from DataRAM to OneNAND Core (write)
- * Erase the OneNAND Core (erase)
- */
-static void onenand_data_handle(struct onenand_chip *this, int cmd,
-                               int dataram, unsigned int offset)
-{
-       struct mtd_info *mtd = &info->mtd;
-       struct onenand_flash *flash = this->priv;
-       int main_offset, spare_offset, die = 0;
-       void __iomem *src;
-       void __iomem *dest;
-       unsigned int i;
-       static int pi_operation;
-       int erasesize, rgn;
-
-       if (dataram) {
-               main_offset = mtd->writesize;
-               spare_offset = mtd->oobsize;
-       } else {
-               main_offset = 0;
-               spare_offset = 0;
-       }
-
-       if (pi_operation) {
-               die = readw(this->base + ONENAND_REG_START_ADDRESS2);
-               die >>= ONENAND_DDP_SHIFT;
-       }
-
-       switch (cmd) {
-       case FLEXONENAND_CMD_PI_ACCESS:
-               pi_operation = 1;
-               break;
-
-       case ONENAND_CMD_RESET:
-               pi_operation = 0;
-               break;
-
-       case ONENAND_CMD_READ:
-               src = ONENAND_CORE(flash) + offset;
-               dest = ONENAND_MAIN_AREA(this, main_offset);
-               if (pi_operation) {
-                       writew(boundary[die], this->base + ONENAND_DATARAM);
-                       break;
-               }
-               memcpy(dest, src, mtd->writesize);
-               /* Fall through */
-
-       case ONENAND_CMD_READOOB:
-               src = ONENAND_CORE_SPARE(flash, this, offset);
-               dest = ONENAND_SPARE_AREA(this, spare_offset);
-               memcpy(dest, src, mtd->oobsize);
-               break;
-
-       case ONENAND_CMD_PROG:
-               src = ONENAND_MAIN_AREA(this, main_offset);
-               dest = ONENAND_CORE(flash) + offset;
-               if (pi_operation) {
-                       boundary[die] = readw(this->base + ONENAND_DATARAM);
-                       break;
-               }
-               /* To handle partial write */
-               for (i = 0; i < (1 << mtd->subpage_sft); i++) {
-                       int off = i * this->subpagesize;
-                       if (!memcmp(src + off, ffchars, this->subpagesize))
-                               continue;
-                       if (memcmp(dest + off, ffchars, this->subpagesize) &&
-                           onenand_check_overwrite(dest + off, src + off, this->subpagesize))
-                               printk(KERN_ERR "over-write happened at 0x%08x\n", offset);
-                       memcpy(dest + off, src + off, this->subpagesize);
-               }
-               /* Fall through */
-
-       case ONENAND_CMD_PROGOOB:
-               src = ONENAND_SPARE_AREA(this, spare_offset);
-               /* Check all data is 0xff chars */
-               if (!memcmp(src, ffchars, mtd->oobsize))
-                       break;
-
-               dest = ONENAND_CORE_SPARE(flash, this, offset);
-               if (memcmp(dest, ffchars, mtd->oobsize) &&
-                   onenand_check_overwrite(dest, src, mtd->oobsize))
-                       printk(KERN_ERR "OOB: over-write happened at 0x%08x\n",
-                              offset);
-               memcpy(dest, src, mtd->oobsize);
-               break;
-
-       case ONENAND_CMD_ERASE:
-               if (pi_operation)
-                       break;
-
-               if (FLEXONENAND(this)) {
-                       rgn = flexonenand_region(mtd, offset);
-                       erasesize = mtd->eraseregions[rgn].erasesize;
-               } else
-                       erasesize = mtd->erasesize;
-
-               memset(ONENAND_CORE(flash) + offset, 0xff, erasesize);
-               memset(ONENAND_CORE_SPARE(flash, this, offset), 0xff,
-                      (erasesize >> 5));
-               break;
-
-       default:
-               break;
-       }
-}
-
-/**
- * onenand_command_handle - Handle command
- * @this:              OneNAND device structure
- * @cmd:               The command to be sent
- *
- * Emulate OneNAND command.
- */
-static void onenand_command_handle(struct onenand_chip *this, int cmd)
-{
-       unsigned long offset = 0;
-       int block = -1, page = -1, bufferram = -1;
-       int dataram = 0;
-
-       switch (cmd) {
-       case ONENAND_CMD_UNLOCK:
-       case ONENAND_CMD_LOCK:
-       case ONENAND_CMD_LOCK_TIGHT:
-       case ONENAND_CMD_UNLOCK_ALL:
-               onenand_lock_handle(this, cmd);
-               break;
-
-       case ONENAND_CMD_BUFFERRAM:
-               /* Do nothing */
-               return;
-
-       default:
-               block = (int) readw(this->base + ONENAND_REG_START_ADDRESS1);
-               if (block & (1 << ONENAND_DDP_SHIFT)) {
-                       block &= ~(1 << ONENAND_DDP_SHIFT);
-                       /* The half of chip block */
-                       block += this->chipsize >> (this->erase_shift + 1);
-               }
-               if (cmd == ONENAND_CMD_ERASE)
-                       break;
-
-               page = (int) readw(this->base + ONENAND_REG_START_ADDRESS8);
-               page = (page >> ONENAND_FPA_SHIFT);
-               bufferram = (int) readw(this->base + ONENAND_REG_START_BUFFER);
-               bufferram >>= ONENAND_BSA_SHIFT;
-               bufferram &= ONENAND_BSA_DATARAM1;
-               dataram = (bufferram == ONENAND_BSA_DATARAM1) ? 1 : 0;
-               break;
-       }
-
-       if (block != -1)
-               offset = onenand_addr(this, block);
-
-       if (page != -1)
-               offset += page << this->page_shift;
-
-       onenand_data_handle(this, cmd, dataram, offset);
-
-       onenand_update_interrupt(this, cmd);
-}
-
-/**
- * onenand_writew - [OneNAND Interface] Emulate write operation
- * @value:             value to write
- * @addr:              address to write
- *
- * Write OneNAND register with value
- */
-static void onenand_writew(unsigned short value, void __iomem * addr)
-{
-       struct onenand_chip *this = info->mtd.priv;
-
-       /* BootRAM handling */
-       if (addr < this->base + ONENAND_DATARAM) {
-               onenand_bootram_handle(this, value);
-               return;
-       }
-       /* Command handling */
-       if (addr == this->base + ONENAND_REG_COMMAND)
-               onenand_command_handle(this, value);
-
-       writew(value, addr);
-}
-
-/**
- * flash_init - Initialize OneNAND simulator
- * @flash:             OneNAND simulator data structures
- *
- * Initialize OneNAND simulator.
- */
-static int __init flash_init(struct onenand_flash *flash)
-{
-       int density, size;
-       int buffer_size;
-
-       flash->base = kzalloc(131072, GFP_KERNEL);
-       if (!flash->base) {
-               printk(KERN_ERR "Unable to allocate base address.\n");
-               return -ENOMEM;
-       }
-
-       density = device_id >> ONENAND_DEVICE_DENSITY_SHIFT;
-       density &= ONENAND_DEVICE_DENSITY_MASK;
-       size = ((16 << 20) << density);
-
-       ONENAND_CORE(flash) = vmalloc(size + (size >> 5));
-       if (!ONENAND_CORE(flash)) {
-               printk(KERN_ERR "Unable to allocate nand core address.\n");
-               kfree(flash->base);
-               return -ENOMEM;
-       }
-
-       memset(ONENAND_CORE(flash), 0xff, size + (size >> 5));
-
-       /* Setup registers */
-       writew(manuf_id, flash->base + ONENAND_REG_MANUFACTURER_ID);
-       writew(device_id, flash->base + ONENAND_REG_DEVICE_ID);
-       writew(version_id, flash->base + ONENAND_REG_VERSION_ID);
-       writew(technology_id, flash->base + ONENAND_REG_TECHNOLOGY);
-
-       if (density < 2 && (!CONFIG_FLEXONENAND))
-               buffer_size = 0x0400;   /* 1KiB page */
-       else
-               buffer_size = 0x0800;   /* 2KiB page */
-       writew(buffer_size, flash->base + ONENAND_REG_DATA_BUFFER_SIZE);
-
-       return 0;
-}
-
-/**
- * flash_exit - Clean up OneNAND simulator
- * @flash:             OneNAND simulator data structures
- *
- * Clean up OneNAND simulator.
- */
-static void flash_exit(struct onenand_flash *flash)
-{
-       vfree(ONENAND_CORE(flash));
-       kfree(flash->base);
-}
-
-static int __init onenand_sim_init(void)
-{
-       /* Allocate all 0xff chars pointer */
-       ffchars = kmalloc(MAX_ONENAND_PAGESIZE, GFP_KERNEL);
-       if (!ffchars) {
-               printk(KERN_ERR "Unable to allocate ff chars.\n");
-               return -ENOMEM;
-       }
-       memset(ffchars, 0xff, MAX_ONENAND_PAGESIZE);
-
-       /* Allocate OneNAND simulator mtd pointer */
-       info = kzalloc(sizeof(struct onenand_info), GFP_KERNEL);
-       if (!info) {
-               printk(KERN_ERR "Unable to allocate core structures.\n");
-               kfree(ffchars);
-               return -ENOMEM;
-       }
-
-       /* Override write_word function */
-       info->onenand.write_word = onenand_writew;
-
-       if (flash_init(&info->flash)) {
-               printk(KERN_ERR "Unable to allocate flash.\n");
-               kfree(ffchars);
-               kfree(info);
-               return -ENOMEM;
-       }
-
-       info->parts = os_partitions;
-
-       info->onenand.base = info->flash.base;
-       info->onenand.priv = &info->flash;
-
-       info->mtd.name = "OneNAND simulator";
-       info->mtd.priv = &info->onenand;
-       info->mtd.owner = THIS_MODULE;
-
-       if (onenand_scan(&info->mtd, 1)) {
-               flash_exit(&info->flash);
-               kfree(ffchars);
-               kfree(info);
-               return -ENXIO;
-       }
-
-       mtd_device_register(&info->mtd, info->parts,
-                           ARRAY_SIZE(os_partitions));
-
-       return 0;
-}
-
-static void __exit onenand_sim_exit(void)
-{
-       struct onenand_chip *this = info->mtd.priv;
-       struct onenand_flash *flash = this->priv;
-
-       onenand_release(&info->mtd);
-       flash_exit(flash);
-       kfree(ffchars);
-       kfree(info);
-}
-
-module_init(onenand_sim_init);
-module_exit(onenand_sim_exit);
-
-MODULE_AUTHOR("Kyungmin Park <kyungmin.park@samsung.com>");
-MODULE_DESCRIPTION("The OneNAND flash simulator");
-MODULE_LICENSE("GPL");
index 8dd6ba52404a390c57462bb90f25f470c1d32687..f9d5615c572747ee6137a89327aa974322fd949f 100644 (file)
@@ -1107,7 +1107,7 @@ static int sm_flush(struct mtd_blktrans_dev *dev)
 }
 
 /* outside interface: device is released */
-static int sm_release(struct mtd_blktrans_dev *dev)
+static void sm_release(struct mtd_blktrans_dev *dev)
 {
        struct sm_ftl *ftl = dev->priv;
 
@@ -1116,7 +1116,6 @@ static int sm_release(struct mtd_blktrans_dev *dev)
        cancel_work_sync(&ftl->flush_work);
        sm_cache_flush(ftl);
        mutex_unlock(&ftl->mutex);
-       return 0;
 }
 
 /* outside interface: get geometry */
index ee705771bd2cb43658fd55edffa5811a179b6c80..dada66bfe0d6e018778ba24939f15b82e9468b0d 100644 (file)
@@ -1700,7 +1700,8 @@ static int bfin_mac_probe(struct platform_device *pdev)
        }
 
        bfin_mac_hwtstamp_init(ndev);
-       if (bfin_phc_init(ndev, &pdev->dev)) {
+       rc = bfin_phc_init(ndev, &pdev->dev);
+       if (rc) {
                dev_err(&pdev->dev, "Cannot register PHC device!\n");
                goto out_err_phc;
        }
index 40649a8bf3901cb5d7e8a7037ad19aced910c55b..6b0dc131b20ea3fd057fdd8c1fa98243ecbdf42c 100644 (file)
@@ -4085,7 +4085,7 @@ static int cnic_cm_alloc_mem(struct cnic_dev *dev)
        if (!cp->csk_tbl)
                return -ENOMEM;
 
-       port_id = random32();
+       port_id = prandom_u32();
        port_id %= CNIC_LOCAL_PORT_RANGE;
        if (cnic_init_id_tbl(&cp->csk_port_tbl, CNIC_LOCAL_PORT_RANGE,
                             CNIC_LOCAL_PORT_MIN, port_id)) {
@@ -4145,7 +4145,7 @@ static int cnic_cm_init_bnx2_hw(struct cnic_dev *dev)
 {
        u32 seed;
 
-       seed = random32();
+       seed = prandom_u32();
        cnic_ctx_wr(dev, 45, 0, seed);
        return 0;
 }
index e1e5bb9d90545d6a625d5476c873b942a2a37a04..fd7b547698abd89f829008e717e34f2fc9d9e2f9 100644 (file)
@@ -2640,9 +2640,8 @@ int be_cmd_get_mac_from_list(struct be_adapter *adapter, u8 *mac,
        req = get_mac_list_cmd.va;
 
        be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON,
-                               OPCODE_COMMON_GET_MAC_LIST, sizeof(*req),
-                               wrb, &get_mac_list_cmd);
-
+                              OPCODE_COMMON_GET_MAC_LIST,
+                              get_mac_list_cmd.size, wrb, &get_mac_list_cmd);
        req->hdr.domain = domain;
        req->mac_type = MAC_ADDRESS_TYPE_NETWORK;
        req->perm_override = 1;
index 6c52a60dcdb769b9fb52ea3ba8050dd413432102..a444110b060fd74361759be2f9f3a25a4f0fe095 100644 (file)
@@ -1827,7 +1827,7 @@ static void be_rx_cq_clean(struct be_rx_obj *rxo)
                        mdelay(1);
                } else {
                        be_rx_compl_discard(rxo, rxcp);
-                       be_cq_notify(adapter, rx_cq->id, true, 1);
+                       be_cq_notify(adapter, rx_cq->id, false, 1);
                        if (rxcp->num_rcvd == 0)
                                break;
                }
@@ -2533,11 +2533,6 @@ static void be_rx_qs_destroy(struct be_adapter *adapter)
                q = &rxo->q;
                if (q->created) {
                        be_cmd_rxq_destroy(adapter, q);
-                       /* After the rxq is invalidated, wait for a grace time
-                        * of 1ms for all dma to end and the flush compl to
-                        * arrive
-                        */
-                       mdelay(1);
                        be_rx_cq_clean(rxo);
                }
                be_queue_free(adapter, q);
@@ -2564,6 +2559,7 @@ static int be_close(struct net_device *netdev)
         * all tx skbs are freed.
         */
        be_tx_compl_clean(adapter);
+       netif_tx_disable(netdev);
 
        be_rx_qs_destroy(adapter);
 
@@ -2672,6 +2668,7 @@ static int be_open(struct net_device *netdev)
        if (!status)
                be_link_status_update(adapter, link_status);
 
+       netif_tx_start_all_queues(netdev);
        be_roce_dev_open(adapter);
        return 0;
 err:
@@ -2783,6 +2780,8 @@ static void be_vf_clear(struct be_adapter *adapter)
                goto done;
        }
 
+       pci_disable_sriov(adapter->pdev);
+
        for_all_vfs(adapter, vf_cfg, vf) {
                if (lancer_chip(adapter))
                        be_cmd_set_mac_list(adapter, NULL, 0, vf + 1);
@@ -2792,7 +2791,6 @@ static void be_vf_clear(struct be_adapter *adapter)
 
                be_cmd_if_destroy(adapter, vf_cfg->if_handle, vf + 1);
        }
-       pci_disable_sriov(adapter->pdev);
 done:
        kfree(adapter->vf_cfg);
        adapter->num_vfs = 0;
@@ -2889,13 +2887,8 @@ static int be_vf_setup(struct be_adapter *adapter)
                        dev_info(dev, "Device supports %d VFs and not %d\n",
                                 adapter->dev_num_vfs, num_vfs);
                adapter->num_vfs = min_t(u16, num_vfs, adapter->dev_num_vfs);
-
-               status = pci_enable_sriov(adapter->pdev, num_vfs);
-               if (status) {
-                       dev_err(dev, "SRIOV enable failed\n");
-                       adapter->num_vfs = 0;
+               if (!adapter->num_vfs)
                        return 0;
-               }
        }
 
        status = be_vf_setup_init(adapter);
@@ -2944,6 +2937,15 @@ static int be_vf_setup(struct be_adapter *adapter)
 
                be_cmd_enable_vf(adapter, vf + 1);
        }
+
+       if (!old_vfs) {
+               status = pci_enable_sriov(adapter->pdev, adapter->num_vfs);
+               if (status) {
+                       dev_err(dev, "SRIOV enable failed\n");
+                       adapter->num_vfs = 0;
+                       goto err;
+               }
+       }
        return 0;
 err:
        dev_err(dev, "VF setup failed\n");
@@ -3198,7 +3200,7 @@ static int be_setup(struct be_adapter *adapter)
                be_cmd_set_flow_control(adapter, adapter->tx_fc,
                                        adapter->rx_fc);
 
-       if (be_physfn(adapter) && num_vfs) {
+       if (be_physfn(adapter)) {
                if (adapter->dev_num_vfs)
                        be_vf_setup(adapter);
                else
index ceb4d43c132db30f4b82c09a2f100e1f6c98652b..9ce5b7185fda196f521d439a143b3021b56ff5c3 100644 (file)
@@ -198,6 +198,11 @@ struct bufdesc_ex {
 #define FLAG_RX_CSUM_ENABLED   (BD_ENET_RX_ICE | BD_ENET_RX_PCR)
 #define FLAG_RX_CSUM_ERROR     (BD_ENET_RX_ICE | BD_ENET_RX_PCR)
 
+struct fec_enet_delayed_work {
+       struct delayed_work delay_work;
+       bool timeout;
+};
+
 /* The FEC buffer descriptors track the ring buffers.  The rx_bd_base and
  * tx_bd_base always point to the base of the buffer descriptors.  The
  * cur_rx and cur_tx point to the currently available buffer.
@@ -232,9 +237,6 @@ struct fec_enet_private {
        /* The ring entries to be free()ed */
        struct bufdesc  *dirty_tx;
 
-       /* hold while accessing the HW like ringbuffer for tx/rx but not MAC */
-       spinlock_t hw_lock;
-
        struct  platform_device *pdev;
 
        int     opened;
@@ -269,7 +271,7 @@ struct fec_enet_private {
        int hwts_rx_en;
        int hwts_tx_en;
        struct timer_list time_keep;
-
+       struct fec_enet_delayed_work delay_work;
 };
 
 void fec_ptp_init(struct net_device *ndev, struct platform_device *pdev);
index e25bf832e6b3e1f5695f7075537cef6d711c3706..aff0310a778bf7afb545c4b53f995770f7f03ec2 100644 (file)
@@ -445,6 +445,13 @@ fec_restart(struct net_device *ndev, int duplex)
        u32 rcntl = OPT_FRAME_SIZE | 0x04;
        u32 ecntl = 0x2; /* ETHEREN */
 
+       if (netif_running(ndev)) {
+               netif_device_detach(ndev);
+               napi_disable(&fep->napi);
+               netif_stop_queue(ndev);
+               netif_tx_lock(ndev);
+       }
+
        /* Whack a reset.  We should wait for this. */
        writel(1, fep->hwp + FEC_ECNTRL);
        udelay(10);
@@ -605,6 +612,13 @@ fec_restart(struct net_device *ndev, int duplex)
 
        /* Enable interrupts we wish to service */
        writel(FEC_DEFAULT_IMASK, fep->hwp + FEC_IMASK);
+
+       if (netif_running(ndev)) {
+               netif_device_attach(ndev);
+               napi_enable(&fep->napi);
+               netif_wake_queue(ndev);
+               netif_tx_unlock(ndev);
+       }
 }
 
 static void
@@ -644,8 +658,22 @@ fec_timeout(struct net_device *ndev)
 
        ndev->stats.tx_errors++;
 
-       fec_restart(ndev, fep->full_duplex);
-       netif_wake_queue(ndev);
+       fep->delay_work.timeout = true;
+       schedule_delayed_work(&(fep->delay_work.delay_work), 0);
+}
+
+static void fec_enet_work(struct work_struct *work)
+{
+       struct fec_enet_private *fep =
+               container_of(work,
+                            struct fec_enet_private,
+                            delay_work.delay_work.work);
+
+       if (fep->delay_work.timeout) {
+               fep->delay_work.timeout = false;
+               fec_restart(fep->netdev, fep->full_duplex);
+               netif_wake_queue(fep->netdev);
+       }
 }
 
 static void
@@ -1024,16 +1052,12 @@ static void fec_enet_adjust_link(struct net_device *ndev)
 {
        struct fec_enet_private *fep = netdev_priv(ndev);
        struct phy_device *phy_dev = fep->phy_dev;
-       unsigned long flags;
-
        int status_change = 0;
 
-       spin_lock_irqsave(&fep->hw_lock, flags);
-
        /* Prevent a state halted on mii error */
        if (fep->mii_timeout && phy_dev->state == PHY_HALTED) {
                phy_dev->state = PHY_RESUMING;
-               goto spin_unlock;
+               return;
        }
 
        if (phy_dev->link) {
@@ -1061,9 +1085,6 @@ static void fec_enet_adjust_link(struct net_device *ndev)
                }
        }
 
-spin_unlock:
-       spin_unlock_irqrestore(&fep->hw_lock, flags);
-
        if (status_change)
                phy_print_status(phy_dev);
 }
@@ -1732,7 +1753,6 @@ static int fec_enet_init(struct net_device *ndev)
                return -ENOMEM;
 
        memset(cbd_base, 0, PAGE_SIZE);
-       spin_lock_init(&fep->hw_lock);
 
        fep->netdev = ndev;
 
@@ -1952,6 +1972,7 @@ fec_probe(struct platform_device *pdev)
        if (fep->bufdesc_ex && fep->ptp_clock)
                netdev_info(ndev, "registered PHC device %d\n", fep->dev_id);
 
+       INIT_DELAYED_WORK(&(fep->delay_work.delay_work), fec_enet_work);
        return 0;
 
 failed_register:
@@ -1984,6 +2005,7 @@ fec_drv_remove(struct platform_device *pdev)
        struct fec_enet_private *fep = netdev_priv(ndev);
        int i;
 
+       cancel_delayed_work_sync(&(fep->delay_work.delay_work));
        unregister_netdev(ndev);
        fec_enet_mii_remove(fep);
        del_timer_sync(&fep->time_keep);
index bcf4d118e98c9228b32ac196136b15fa515c7ace..c9e6b62dd000955565fb0186d334221a834b0a40 100644 (file)
@@ -889,7 +889,7 @@ static int mlx4_en_flow_replace(struct net_device *dev,
                .queue_mode = MLX4_NET_TRANS_Q_FIFO,
                .exclusive = 0,
                .allow_loopback = 1,
-               .promisc_mode = MLX4_FS_PROMISC_NONE,
+               .promisc_mode = MLX4_FS_REGULAR,
        };
 
        rule.port = priv->port;
index a69a908614e69b0760630204a2c7ba8518390ade..b35f9470009363bad560f7142322436baa76d8a3 100644 (file)
@@ -127,7 +127,7 @@ static void mlx4_en_filter_work(struct work_struct *work)
                .queue_mode = MLX4_NET_TRANS_Q_LIFO,
                .exclusive = 1,
                .allow_loopback = 1,
-               .promisc_mode = MLX4_FS_PROMISC_NONE,
+               .promisc_mode = MLX4_FS_REGULAR,
                .port = priv->port,
                .priority = MLX4_DOMAIN_RFS,
        };
@@ -448,7 +448,7 @@ static int mlx4_en_uc_steer_add(struct mlx4_en_priv *priv,
                        .queue_mode = MLX4_NET_TRANS_Q_FIFO,
                        .exclusive = 0,
                        .allow_loopback = 1,
-                       .promisc_mode = MLX4_FS_PROMISC_NONE,
+                       .promisc_mode = MLX4_FS_REGULAR,
                        .priority = MLX4_DOMAIN_NIC,
                };
 
@@ -795,7 +795,7 @@ static void mlx4_en_set_promisc_mode(struct mlx4_en_priv *priv,
                        err = mlx4_flow_steer_promisc_add(mdev->dev,
                                                          priv->port,
                                                          priv->base_qpn,
-                                                         MLX4_FS_PROMISC_UPLINK);
+                                                         MLX4_FS_ALL_DEFAULT);
                        if (err)
                                en_err(priv, "Failed enabling promiscuous mode\n");
                        priv->flags |= MLX4_EN_FLAG_MC_PROMISC;
@@ -858,7 +858,7 @@ static void mlx4_en_clear_promisc_mode(struct mlx4_en_priv *priv,
        case MLX4_STEERING_MODE_DEVICE_MANAGED:
                err = mlx4_flow_steer_promisc_remove(mdev->dev,
                                                     priv->port,
-                                                    MLX4_FS_PROMISC_UPLINK);
+                                                    MLX4_FS_ALL_DEFAULT);
                if (err)
                        en_err(priv, "Failed disabling promiscuous mode\n");
                priv->flags &= ~MLX4_EN_FLAG_MC_PROMISC;
@@ -919,7 +919,7 @@ static void mlx4_en_do_multicast(struct mlx4_en_priv *priv,
                                err = mlx4_flow_steer_promisc_add(mdev->dev,
                                                                  priv->port,
                                                                  priv->base_qpn,
-                                                                 MLX4_FS_PROMISC_ALL_MULTI);
+                                                                 MLX4_FS_MC_DEFAULT);
                                break;
 
                        case MLX4_STEERING_MODE_B0:
@@ -942,7 +942,7 @@ static void mlx4_en_do_multicast(struct mlx4_en_priv *priv,
                        case MLX4_STEERING_MODE_DEVICE_MANAGED:
                                err = mlx4_flow_steer_promisc_remove(mdev->dev,
                                                                     priv->port,
-                                                                    MLX4_FS_PROMISC_ALL_MULTI);
+                                                                    MLX4_FS_MC_DEFAULT);
                                break;
 
                        case MLX4_STEERING_MODE_B0:
@@ -1621,10 +1621,10 @@ void mlx4_en_stop_port(struct net_device *dev, int detach)
                                 MLX4_EN_FLAG_MC_PROMISC);
                mlx4_flow_steer_promisc_remove(mdev->dev,
                                               priv->port,
-                                              MLX4_FS_PROMISC_UPLINK);
+                                              MLX4_FS_ALL_DEFAULT);
                mlx4_flow_steer_promisc_remove(mdev->dev,
                                               priv->port,
-                                              MLX4_FS_PROMISC_ALL_MULTI);
+                                              MLX4_FS_MC_DEFAULT);
        } else if (priv->flags & MLX4_EN_FLAG_PROMISC) {
                priv->flags &= ~MLX4_EN_FLAG_PROMISC;
 
index 8e3123a1df886de6b0afb2c60397ecd28be3e4b9..6000342f9725db0f29e8bac3a249765e53c90173 100644 (file)
@@ -497,8 +497,8 @@ static int mlx4_eq_int(struct mlx4_dev *dev, struct mlx4_eq *eq)
                        break;
 
                case MLX4_EVENT_TYPE_SRQ_LIMIT:
-                       mlx4_warn(dev, "%s: MLX4_EVENT_TYPE_SRQ_LIMIT\n",
-                                 __func__);
+                       mlx4_dbg(dev, "%s: MLX4_EVENT_TYPE_SRQ_LIMIT\n",
+                                __func__);
                case MLX4_EVENT_TYPE_SRQ_CATAS_ERROR:
                        if (mlx4_is_master(dev)) {
                                /* forward only to slave owning the SRQ */
index ffc78d2cb0cf648341e26733fac5e6508388fd3a..f3e804f2a35f0bd2a9be0e32b7b577bed5518cfa 100644 (file)
@@ -645,25 +645,37 @@ static int find_entry(struct mlx4_dev *dev, u8 port,
        return err;
 }
 
+static const u8 __promisc_mode[] = {
+       [MLX4_FS_REGULAR]   = 0x0,
+       [MLX4_FS_ALL_DEFAULT] = 0x1,
+       [MLX4_FS_MC_DEFAULT] = 0x3,
+       [MLX4_FS_UC_SNIFFER] = 0x4,
+       [MLX4_FS_MC_SNIFFER] = 0x5,
+};
+
+int mlx4_map_sw_to_hw_steering_mode(struct mlx4_dev *dev,
+                                   enum mlx4_net_trans_promisc_mode flow_type)
+{
+       if (flow_type >= MLX4_FS_MODE_NUM || flow_type < 0) {
+               mlx4_err(dev, "Invalid flow type. type = %d\n", flow_type);
+               return -EINVAL;
+       }
+       return __promisc_mode[flow_type];
+}
+EXPORT_SYMBOL_GPL(mlx4_map_sw_to_hw_steering_mode);
+
 static void trans_rule_ctrl_to_hw(struct mlx4_net_trans_rule *ctrl,
                                  struct mlx4_net_trans_rule_hw_ctrl *hw)
 {
-       static const u8 __promisc_mode[] = {
-               [MLX4_FS_PROMISC_NONE]   = 0x0,
-               [MLX4_FS_PROMISC_UPLINK] = 0x1,
-               [MLX4_FS_PROMISC_FUNCTION_PORT] = 0x2,
-               [MLX4_FS_PROMISC_ALL_MULTI] = 0x3,
-       };
-
-       u32 dw = 0;
-
-       dw = ctrl->queue_mode == MLX4_NET_TRANS_Q_LIFO ? 1 : 0;
-       dw |= ctrl->exclusive ? (1 << 2) : 0;
-       dw |= ctrl->allow_loopback ? (1 << 3) : 0;
-       dw |= __promisc_mode[ctrl->promisc_mode] << 8;
-       dw |= ctrl->priority << 16;
-
-       hw->ctrl = cpu_to_be32(dw);
+       u8 flags = 0;
+
+       flags = ctrl->queue_mode == MLX4_NET_TRANS_Q_LIFO ? 1 : 0;
+       flags |= ctrl->exclusive ? (1 << 2) : 0;
+       flags |= ctrl->allow_loopback ? (1 << 3) : 0;
+
+       hw->flags = flags;
+       hw->type = __promisc_mode[ctrl->promisc_mode];
+       hw->prio = cpu_to_be16(ctrl->priority);
        hw->port = ctrl->port;
        hw->qpn = cpu_to_be32(ctrl->qpn);
 }
@@ -677,29 +689,51 @@ const u16 __sw_id_hw[] = {
        [MLX4_NET_TRANS_RULE_ID_UDP]     = 0xE006
 };
 
+int mlx4_map_sw_to_hw_steering_id(struct mlx4_dev *dev,
+                                 enum mlx4_net_trans_rule_id id)
+{
+       if (id >= MLX4_NET_TRANS_RULE_NUM || id < 0) {
+               mlx4_err(dev, "Invalid network rule id. id = %d\n", id);
+               return -EINVAL;
+       }
+       return __sw_id_hw[id];
+}
+EXPORT_SYMBOL_GPL(mlx4_map_sw_to_hw_steering_id);
+
+static const int __rule_hw_sz[] = {
+       [MLX4_NET_TRANS_RULE_ID_ETH] =
+               sizeof(struct mlx4_net_trans_rule_hw_eth),
+       [MLX4_NET_TRANS_RULE_ID_IB] =
+               sizeof(struct mlx4_net_trans_rule_hw_ib),
+       [MLX4_NET_TRANS_RULE_ID_IPV6] = 0,
+       [MLX4_NET_TRANS_RULE_ID_IPV4] =
+               sizeof(struct mlx4_net_trans_rule_hw_ipv4),
+       [MLX4_NET_TRANS_RULE_ID_TCP] =
+               sizeof(struct mlx4_net_trans_rule_hw_tcp_udp),
+       [MLX4_NET_TRANS_RULE_ID_UDP] =
+               sizeof(struct mlx4_net_trans_rule_hw_tcp_udp)
+};
+
+int mlx4_hw_rule_sz(struct mlx4_dev *dev,
+              enum mlx4_net_trans_rule_id id)
+{
+       if (id >= MLX4_NET_TRANS_RULE_NUM || id < 0) {
+               mlx4_err(dev, "Invalid network rule id. id = %d\n", id);
+               return -EINVAL;
+       }
+
+       return __rule_hw_sz[id];
+}
+EXPORT_SYMBOL_GPL(mlx4_hw_rule_sz);
+
 static int parse_trans_rule(struct mlx4_dev *dev, struct mlx4_spec_list *spec,
                            struct _rule_hw *rule_hw)
 {
-       static const size_t __rule_hw_sz[] = {
-               [MLX4_NET_TRANS_RULE_ID_ETH] =
-                       sizeof(struct mlx4_net_trans_rule_hw_eth),
-               [MLX4_NET_TRANS_RULE_ID_IB] =
-                       sizeof(struct mlx4_net_trans_rule_hw_ib),
-               [MLX4_NET_TRANS_RULE_ID_IPV6] = 0,
-               [MLX4_NET_TRANS_RULE_ID_IPV4] =
-                       sizeof(struct mlx4_net_trans_rule_hw_ipv4),
-               [MLX4_NET_TRANS_RULE_ID_TCP] =
-                       sizeof(struct mlx4_net_trans_rule_hw_tcp_udp),
-               [MLX4_NET_TRANS_RULE_ID_UDP] =
-                       sizeof(struct mlx4_net_trans_rule_hw_tcp_udp)
-       };
-       if (spec->id >= MLX4_NET_TRANS_RULE_NUM) {
-               mlx4_err(dev, "Invalid network rule id. id = %d\n", spec->id);
+       if (mlx4_hw_rule_sz(dev, spec->id) < 0)
                return -EINVAL;
-       }
-       memset(rule_hw, 0, __rule_hw_sz[spec->id]);
+       memset(rule_hw, 0, mlx4_hw_rule_sz(dev, spec->id));
        rule_hw->id = cpu_to_be16(__sw_id_hw[spec->id]);
-       rule_hw->size = __rule_hw_sz[spec->id] >> 2;
+       rule_hw->size = mlx4_hw_rule_sz(dev, spec->id) >> 2;
 
        switch (spec->id) {
        case MLX4_NET_TRANS_RULE_ID_ETH:
@@ -713,12 +747,12 @@ static int parse_trans_rule(struct mlx4_dev *dev, struct mlx4_spec_list *spec,
                        rule_hw->eth.ether_type_enable = 1;
                        rule_hw->eth.ether_type = spec->eth.ether_type;
                }
-               rule_hw->eth.vlan_id = spec->eth.vlan_id;
-               rule_hw->eth.vlan_id_msk = spec->eth.vlan_id_msk;
+               rule_hw->eth.vlan_tag = spec->eth.vlan_id;
+               rule_hw->eth.vlan_tag_msk = spec->eth.vlan_id_msk;
                break;
 
        case MLX4_NET_TRANS_RULE_ID_IB:
-               rule_hw->ib.qpn = spec->ib.r_qpn;
+               rule_hw->ib.l3_qpn = spec->ib.l3_qpn;
                rule_hw->ib.qpn_mask = spec->ib.qpn_msk;
                memcpy(&rule_hw->ib.dst_gid, &spec->ib.dst_gid, 16);
                memcpy(&rule_hw->ib.dst_gid_msk, &spec->ib.dst_gid_msk, 16);
@@ -1136,7 +1170,7 @@ int mlx4_trans_to_dmfs_attach(struct mlx4_dev *dev, struct mlx4_qp *qp,
                struct mlx4_net_trans_rule rule = {
                        .queue_mode = MLX4_NET_TRANS_Q_FIFO,
                        .exclusive = 0,
-                       .promisc_mode = MLX4_FS_PROMISC_NONE,
+                       .promisc_mode = MLX4_FS_REGULAR,
                        .priority = MLX4_DOMAIN_NIC,
                };
 
@@ -1229,11 +1263,10 @@ int mlx4_flow_steer_promisc_add(struct mlx4_dev *dev, u8 port,
        u64 *regid_p;
 
        switch (mode) {
-       case MLX4_FS_PROMISC_UPLINK:
-       case MLX4_FS_PROMISC_FUNCTION_PORT:
+       case MLX4_FS_ALL_DEFAULT:
                regid_p = &dev->regid_promisc_array[port];
                break;
-       case MLX4_FS_PROMISC_ALL_MULTI:
+       case MLX4_FS_MC_DEFAULT:
                regid_p = &dev->regid_allmulti_array[port];
                break;
        default:
@@ -1260,11 +1293,10 @@ int mlx4_flow_steer_promisc_remove(struct mlx4_dev *dev, u8 port,
        u64 *regid_p;
 
        switch (mode) {
-       case MLX4_FS_PROMISC_UPLINK:
-       case MLX4_FS_PROMISC_FUNCTION_PORT:
+       case MLX4_FS_ALL_DEFAULT:
                regid_p = &dev->regid_promisc_array[port];
                break;
-       case MLX4_FS_PROMISC_ALL_MULTI:
+       case MLX4_FS_MC_DEFAULT:
                regid_p = &dev->regid_allmulti_array[port];
                break;
        default:
index eac3dae10efe4170d7ed7c276eaccca1ff2434fa..df15bb6631cc7d6f68b70191e891321f505fe00b 100644 (file)
@@ -730,85 +730,6 @@ struct mlx4_steer {
        struct list_head steer_entries[MLX4_NUM_STEERS];
 };
 
-struct mlx4_net_trans_rule_hw_ctrl {
-       __be32 ctrl;
-       u8 rsvd1;
-       u8 funcid;
-       u8 vep;
-       u8 port;
-       __be32 qpn;
-       __be32 rsvd2;
-};
-
-struct mlx4_net_trans_rule_hw_ib {
-       u8 size;
-       u8 rsvd1;
-       __be16 id;
-       u32 rsvd2;
-       __be32 qpn;
-       __be32 qpn_mask;
-       u8 dst_gid[16];
-       u8 dst_gid_msk[16];
-} __packed;
-
-struct mlx4_net_trans_rule_hw_eth {
-       u8      size;
-       u8      rsvd;
-       __be16  id;
-       u8      rsvd1[6];
-       u8      dst_mac[6];
-       u16     rsvd2;
-       u8      dst_mac_msk[6];
-       u16     rsvd3;
-       u8      src_mac[6];
-       u16     rsvd4;
-       u8      src_mac_msk[6];
-       u8      rsvd5;
-       u8      ether_type_enable;
-       __be16  ether_type;
-       __be16  vlan_id_msk;
-       __be16  vlan_id;
-} __packed;
-
-struct mlx4_net_trans_rule_hw_tcp_udp {
-       u8      size;
-       u8      rsvd;
-       __be16  id;
-       __be16  rsvd1[3];
-       __be16  dst_port;
-       __be16  rsvd2;
-       __be16  dst_port_msk;
-       __be16  rsvd3;
-       __be16  src_port;
-       __be16  rsvd4;
-       __be16  src_port_msk;
-} __packed;
-
-struct mlx4_net_trans_rule_hw_ipv4 {
-       u8      size;
-       u8      rsvd;
-       __be16  id;
-       __be32  rsvd1;
-       __be32  dst_ip;
-       __be32  dst_ip_msk;
-       __be32  src_ip;
-       __be32  src_ip_msk;
-} __packed;
-
-struct _rule_hw {
-       union {
-               struct {
-                       u8 size;
-                       u8 rsvd;
-                       __be16 id;
-               };
-               struct mlx4_net_trans_rule_hw_eth eth;
-               struct mlx4_net_trans_rule_hw_ib ib;
-               struct mlx4_net_trans_rule_hw_ipv4 ipv4;
-               struct mlx4_net_trans_rule_hw_tcp_udp tcp_udp;
-       };
-};
-
 enum {
        MLX4_PCI_DEV_IS_VF              = 1 << 0,
        MLX4_PCI_DEV_FORCE_SENSE_PORT   = 1 << 1,
index e329fe1f11b736d8e717880c7e6e7e092db2a14c..79fd269e2c54b3e0a043ab072a4e2b0f888e0088 100644 (file)
@@ -298,3 +298,18 @@ void mlx4_cleanup_srq_table(struct mlx4_dev *dev)
                return;
        mlx4_bitmap_cleanup(&mlx4_priv(dev)->srq_table.bitmap);
 }
+
+struct mlx4_srq *mlx4_srq_lookup(struct mlx4_dev *dev, u32 srqn)
+{
+       struct mlx4_srq_table *srq_table = &mlx4_priv(dev)->srq_table;
+       struct mlx4_srq *srq;
+       unsigned long flags;
+
+       spin_lock_irqsave(&srq_table->lock, flags);
+       srq = radix_tree_lookup(&srq_table->tree,
+                               srqn & (dev->caps.num_srqs - 1));
+       spin_unlock_irqrestore(&srq_table->lock, flags);
+
+       return srq;
+}
+EXPORT_SYMBOL_GPL(mlx4_srq_lookup);
index 07f6baa15c0cd4cc075130d45130729eb77c7d1b..9a95abf2dedfad3dce17f6739eaba93a23e4c105 100644 (file)
@@ -912,8 +912,10 @@ static int efx_ptp_probe_channel(struct efx_channel *channel)
 
        ptp->phc_clock = ptp_clock_register(&ptp->phc_clock_info,
                                            &efx->pci_dev->dev);
-       if (!ptp->phc_clock)
+       if (IS_ERR(ptp->phc_clock)) {
+               rc = PTR_ERR(ptp->phc_clock);
                goto fail3;
+       }
 
        INIT_WORK(&ptp->pps_work, efx_ptp_pps_worker);
        ptp->pps_workwq = create_singlethread_workqueue("sfc_pps");
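The fix above reflects that ptp_clock_register() reports failure through an ERR_PTR-encoded pointer rather than NULL (at least when PTP clock support is built in), so the old NULL test both missed errors and left rc unset. A generic sketch of the IS_ERR()/PTR_ERR() idiom it switches to:

#include <linux/err.h>
#include <linux/ptp_clock_kernel.h>

/* Sketch only: register a PHC and propagate a registration failure. */
static int register_phc_example(struct ptp_clock_info *info, struct device *dev)
{
	struct ptp_clock *clk = ptp_clock_register(info, dev);

	if (IS_ERR(clk))
		return PTR_ERR(clk);	/* negative errno encoded in the pointer */

	/* success: stash 'clk' for later ptp_clock_unregister() */
	return 0;
}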
index c655fe60121ee3694297f289ef3d4529d3081a9d..5734480c1ecfa5f91a134720fe478bb68826fc50 100644 (file)
@@ -1990,7 +1990,8 @@ spider_net_open(struct net_device *netdev)
                goto alloc_rx_failed;
 
        /* Allocate rx skbs */
-       if (spider_net_alloc_rx_skbs(card))
+       result = spider_net_alloc_rx_skbs(card);
+       if (result)
                goto alloc_skbs_failed;
 
        spider_net_set_multi(netdev);
index 49b8b58fc5c6f7823a9507ab2a0d75ab7d51dcac..484f77ec2ce1f439d21caad0656fc6d2924b1577 100644 (file)
@@ -449,7 +449,7 @@ static int transmit(struct baycom_state *bc, int cnt, unsigned char stat)
                        if ((--bc->hdlctx.slotcnt) > 0)
                                return 0;
                        bc->hdlctx.slotcnt = bc->ch_params.slottime;
-                       if ((random32() % 256) > bc->ch_params.ppersist)
+                       if ((prandom_u32() % 256) > bc->ch_params.ppersist)
                                return 0;
                }
        }
index a4a3516b6bbf87d05545a48d8e46627a5f81ee7c..3169252613faae400904201fa5ad9f1a7decf1cf 100644 (file)
@@ -389,7 +389,7 @@ void hdlcdrv_arbitrate(struct net_device *dev, struct hdlcdrv_state *s)
        if ((--s->hdlctx.slotcnt) > 0)
                return;
        s->hdlctx.slotcnt = s->ch_params.slottime;
-       if ((random32() % 256) > s->ch_params.ppersist)
+       if ((prandom_u32() % 256) > s->ch_params.ppersist)
                return;
        start_tx(dev, s);
 }
index b2d863f2ea4255a2f9cc61bda6a9fdff8ab5623c..0721e72f9299250c6c29f0e1d3f7f51affffe7e8 100644 (file)
@@ -638,7 +638,7 @@ static void yam_arbitrate(struct net_device *dev)
        yp->slotcnt = yp->slot / 10;
 
        /* is random > persist ? */
-       if ((random32() % 256) > yp->pers)
+       if ((prandom_u32() % 256) > yp->pers)
                return;
 
        yam_start_tx(dev, yp);
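The three ham-radio hunks above are part of the tree-wide random32() to prandom_u32() rename; the call sites all implement the p-persistence test of CSMA, transmitting in a slot only with probability roughly ppersist/256. A one-function sketch of that test (the helper name is made up):

#include <linux/random.h>	/* prandom_u32() */
#include <linux/types.h>

/* sketch: true when the station may transmit in the current slot */
static bool ppersist_may_transmit(u8 ppersist)
{
	return (prandom_u32() % 256) <= ppersist;
}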
index a06fca61c9a08f308cb0ef82f1d086cf59d0ad33..22b4527321b1cf6a99140f8845ba40137f8cf451 100644 (file)
@@ -609,7 +609,7 @@ static int bfin_sir_open(struct net_device *dev)
 {
        struct bfin_sir_self *self = netdev_priv(dev);
        struct bfin_sir_port *port = self->sir_port;
-       int err = -ENOMEM;
+       int err;
 
        self->newspeed = 0;
        self->speed = 9600;
@@ -623,8 +623,10 @@ static int bfin_sir_open(struct net_device *dev)
        bfin_sir_set_speed(port, 9600);
 
        self->irlap = irlap_open(dev, &self->qos, DRIVER_NAME);
-       if (!self->irlap)
+       if (!self->irlap) {
+               err = -ENOMEM;
                goto err_irlap;
+       }
 
        INIT_WORK(&self->work, bfin_sir_send_work);
 
index 450345261bd32a0ce19564e4120c2f622f4962e2..1e11f2bfd9cef59bbe1f1e563c28918c6ecc7f16 100644 (file)
@@ -126,7 +126,7 @@ config MDIO_BITBANG
 
 config MDIO_GPIO
        tristate "Support for GPIO lib-based bitbanged MDIO buses"
-       depends on MDIO_BITBANG && GENERIC_GPIO
+       depends on MDIO_BITBANG && GPIOLIB
        ---help---
          Supports GPIO lib-based MDIO busses.
 
index 9eabfaa22f3e0eb7690806375d1f21f69ba618fe..5ca14d463ba7d897931b41b8c8f515a67ff9829c 100644 (file)
@@ -18,7 +18,7 @@
 
 static u32 random_N(unsigned int N)
 {
-       return reciprocal_divide(random32(), N);
+       return reciprocal_divide(prandom_u32(), N);
 }
 
 static bool rnd_transmit(struct team *team, struct sk_buff *skb)
index 24fbec27a22a93c144f255cfdb72e5ab7a7dad63..078795fe6e312f22348d381e03b07c4274e07f87 100644 (file)
@@ -613,6 +613,13 @@ static const struct usb_device_id  products [] = {
        .driver_info = 0,
 },
 
+/* Dell Wireless 5804 (Novatel E371) - handled by qmi_wwan */
+{
+       USB_DEVICE_AND_INTERFACE_INFO(DELL_VENDOR_ID, 0x819b, USB_CLASS_COMM,
+                       USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE),
+       .driver_info = 0,
+},
+
 /* AnyDATA ADU960S - handled by qmi_wwan */
 {
        USB_DEVICE_AND_INTERFACE_INFO(0x16d5, 0x650a, USB_CLASS_COMM,
index 834e405fb57abebb9b2c6c6373c5ca09f4da46bf..cf887c2384e95004547bf56ef7dc8ea0a67e66e5 100644 (file)
@@ -501,6 +501,13 @@ static const struct usb_device_id products[] = {
                                              USB_CDC_PROTO_NONE),
                .driver_info        = (unsigned long)&qmi_wwan_info,
        },
+       {       /* Dell Wireless 5804 (Novatel E371) */
+               USB_DEVICE_AND_INTERFACE_INFO(0x413C, 0x819b,
+                                             USB_CLASS_COMM,
+                                             USB_CDC_SUBCLASS_ETHERNET,
+                                             USB_CDC_PROTO_NONE),
+               .driver_info        = (unsigned long)&qmi_wwan_info,
+       },
        {       /* ADU960S */
                USB_DEVICE_AND_INTERFACE_INFO(0x16d5, 0x650a,
                                              USB_CLASS_COMM,
index a923d61c6fc54ab676ade398e8704f7c24a66a0d..a79e9d3349284ac7c8371ae495d03110f87a3d42 100644 (file)
@@ -426,6 +426,13 @@ static void sierra_net_dosync(struct usbnet *dev)
 
        dev_dbg(&dev->udev->dev, "%s", __func__);
 
+       /* The SIERRA_NET_HIP_MSYNC_ID command appears to request that the
+        * firmware restart itself.  After restarting, the modem will respond
+        * with the SIERRA_NET_HIP_RESTART_ID indication.  The driver continues
+        * sending MSYNC commands every few seconds until it receives the
+        * RESTART event from the firmware
+        */
+
        /* tell modem we are ready */
        status = sierra_net_send_sync(dev);
        if (status < 0)
@@ -704,6 +711,9 @@ static int sierra_net_bind(struct usbnet *dev, struct usb_interface *intf)
        /* set context index initially to 0 - prepares tx hdr template */
        sierra_net_set_ctx_index(priv, 0);
 
+       /* prepare sync message template */
+       memcpy(priv->sync_msg, sync_tmplate, sizeof(priv->sync_msg));
+
        /* decrease the rx_urb_size and max_tx_size to 4k on USB 1.1 */
        dev->rx_urb_size  = SIERRA_NET_RX_URB_SIZE;
        if (dev->udev->speed != USB_SPEED_HIGH)
@@ -739,11 +749,6 @@ static int sierra_net_bind(struct usbnet *dev, struct usb_interface *intf)
                kfree(priv);
                return -ENODEV;
        }
-       /* prepare sync message from template */
-       memcpy(priv->sync_msg, sync_tmplate, sizeof(priv->sync_msg));
-
-       /* initiate the sync sequence */
-       sierra_net_dosync(dev);
 
        return 0;
 }
@@ -766,8 +771,9 @@ static void sierra_net_unbind(struct usbnet *dev, struct usb_interface *intf)
                netdev_err(dev->net,
                        "usb_control_msg failed, status %d\n", status);
 
-       sierra_net_set_private(dev, NULL);
+       usbnet_status_stop(dev);
 
+       sierra_net_set_private(dev, NULL);
        kfree(priv);
 }
 
@@ -908,6 +914,24 @@ static const struct driver_info sierra_net_info_direct_ip = {
        .tx_fixup = sierra_net_tx_fixup,
 };
 
+static int
+sierra_net_probe(struct usb_interface *udev, const struct usb_device_id *prod)
+{
+       int ret;
+
+       ret = usbnet_probe(udev, prod);
+       if (ret == 0) {
+               struct usbnet *dev = usb_get_intfdata(udev);
+
+               ret = usbnet_status_start(dev, GFP_KERNEL);
+               if (ret == 0) {
+                       /* Interrupt URB now set up; initiate sync sequence */
+                       sierra_net_dosync(dev);
+               }
+       }
+       return ret;
+}
+
 #define DIRECT_IP_DEVICE(vend, prod) \
        {USB_DEVICE_INTERFACE_NUMBER(vend, prod, 7), \
        .driver_info = (unsigned long)&sierra_net_info_direct_ip}, \
@@ -930,7 +954,7 @@ MODULE_DEVICE_TABLE(usb, products);
 static struct usb_driver sierra_net_driver = {
        .name = "sierra_net",
        .id_table = products,
-       .probe = usbnet_probe,
+       .probe = sierra_net_probe,
        .disconnect = usbnet_disconnect,
        .suspend = usbnet_suspend,
        .resume = usbnet_resume,
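The sync sequence moves out of bind() and into a probe wrapper because the firmware's RESTART indication arrives on the shared interrupt (status) endpoint, which is only guaranteed to be polled once usbnet_status_start() has submitted the URB. A hedged sketch of how a minidriver can use the new refcounted API (the mydrv_* names are hypothetical); the matching usbnet_status_stop() belongs in the unbind path:

	static int mydrv_probe(struct usb_interface *intf,
			       const struct usb_device_id *id)
	{
		int ret = usbnet_probe(intf, id);	/* generic usbnet setup */

		if (ret == 0) {
			struct usbnet *dev = usb_get_intfdata(intf);

			/* take a reference; the URB is submitted on the 0 -> 1 transition */
			ret = usbnet_status_start(dev, GFP_KERNEL);
			if (ret == 0)
				mydrv_start_handshake(dev);	/* hypothetical */
		}
		return ret;
	}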
index 1e5a9b72650e9946e23040f1169421bf87d18317..f95cb032394bb03f05b7ae2605b7b9585e8e3c67 100644 (file)
@@ -252,6 +252,70 @@ static int init_status (struct usbnet *dev, struct usb_interface *intf)
        return 0;
 }
 
+/* Submit the interrupt URB if not previously submitted, increasing refcount */
+int usbnet_status_start(struct usbnet *dev, gfp_t mem_flags)
+{
+       int ret = 0;
+
+       WARN_ON_ONCE(dev->interrupt == NULL);
+       if (dev->interrupt) {
+               mutex_lock(&dev->interrupt_mutex);
+
+               if (++dev->interrupt_count == 1)
+                       ret = usb_submit_urb(dev->interrupt, mem_flags);
+
+               dev_dbg(&dev->udev->dev, "incremented interrupt URB count to %d\n",
+                       dev->interrupt_count);
+               mutex_unlock(&dev->interrupt_mutex);
+       }
+       return ret;
+}
+EXPORT_SYMBOL_GPL(usbnet_status_start);
+
+/* For resume; submit interrupt URB if previously submitted */
+static int __usbnet_status_start_force(struct usbnet *dev, gfp_t mem_flags)
+{
+       int ret = 0;
+
+       mutex_lock(&dev->interrupt_mutex);
+       if (dev->interrupt_count) {
+               ret = usb_submit_urb(dev->interrupt, mem_flags);
+               dev_dbg(&dev->udev->dev,
+                       "submitted interrupt URB for resume\n");
+       }
+       mutex_unlock(&dev->interrupt_mutex);
+       return ret;
+}
+
+/* Kill the interrupt URB if all submitters want it killed */
+void usbnet_status_stop(struct usbnet *dev)
+{
+       if (dev->interrupt) {
+               mutex_lock(&dev->interrupt_mutex);
+               WARN_ON(dev->interrupt_count == 0);
+
+               if (dev->interrupt_count && --dev->interrupt_count == 0)
+                       usb_kill_urb(dev->interrupt);
+
+               dev_dbg(&dev->udev->dev,
+                       "decremented interrupt URB count to %d\n",
+                       dev->interrupt_count);
+               mutex_unlock(&dev->interrupt_mutex);
+       }
+}
+EXPORT_SYMBOL_GPL(usbnet_status_stop);
+
+/* For suspend; always kill interrupt URB */
+static void __usbnet_status_stop_force(struct usbnet *dev)
+{
+       if (dev->interrupt) {
+               mutex_lock(&dev->interrupt_mutex);
+               usb_kill_urb(dev->interrupt);
+               dev_dbg(&dev->udev->dev, "killed interrupt URB for suspend\n");
+               mutex_unlock(&dev->interrupt_mutex);
+       }
+}
+
 /* Passes this packet up the stack, updating its accounting.
  * Some link protocols batch packets, so their rx_fixup paths
  * can return clones as well as just modify the original skb.
@@ -725,7 +789,7 @@ int usbnet_stop (struct net_device *net)
        if (!(info->flags & FLAG_AVOID_UNLINK_URBS))
                usbnet_terminate_urbs(dev);
 
-       usb_kill_urb(dev->interrupt);
+       usbnet_status_stop(dev);
 
        usbnet_purge_paused_rxq(dev);
 
@@ -787,7 +851,7 @@ int usbnet_open (struct net_device *net)
 
        /* start any status interrupt transfer */
        if (dev->interrupt) {
-               retval = usb_submit_urb (dev->interrupt, GFP_KERNEL);
+               retval = usbnet_status_start(dev, GFP_KERNEL);
                if (retval < 0) {
                        netif_err(dev, ifup, dev->net,
                                  "intr submit %d\n", retval);
@@ -1458,6 +1522,8 @@ usbnet_probe (struct usb_interface *udev, const struct usb_device_id *prod)
        dev->delay.data = (unsigned long) dev;
        init_timer (&dev->delay);
        mutex_init (&dev->phy_mutex);
+       mutex_init(&dev->interrupt_mutex);
+       dev->interrupt_count = 0;
 
        dev->net = net;
        strcpy (net->name, "usb%d");
@@ -1593,7 +1659,7 @@ int usbnet_suspend (struct usb_interface *intf, pm_message_t message)
                 */
                netif_device_detach (dev->net);
                usbnet_terminate_urbs(dev);
-               usb_kill_urb(dev->interrupt);
+               __usbnet_status_stop_force(dev);
 
                /*
                 * reattach so runtime management can use and
@@ -1613,9 +1679,8 @@ int usbnet_resume (struct usb_interface *intf)
        int                     retval;
 
        if (!--dev->suspend_count) {
-               /* resume interrupt URBs */
-               if (dev->interrupt && test_bit(EVENT_DEV_OPEN, &dev->flags))
-                       usb_submit_urb(dev->interrupt, GFP_NOIO);
+               /* resume interrupt URB if it was previously submitted */
+               __usbnet_status_start_force(dev, GFP_NOIO);
 
                spin_lock_irq(&dev->txq.lock);
                while ((res = usb_get_from_anchor(&dev->deferred))) {
index 2b90da0d85f3fcfb4e6ac7b0d6520b8e92bfa783..e7a1a47709966972891f96202996c77db40f3eec 100644 (file)
@@ -1117,7 +1117,7 @@ static void brcmf_p2p_afx_handler(struct work_struct *work)
        if (afx_hdl->is_listen && afx_hdl->my_listen_chan)
                /* 100ms ~ 300ms */
                err = brcmf_p2p_discover_listen(p2p, afx_hdl->my_listen_chan,
-                                               100 * (1 + (random32() % 3)));
+                                               100 * (1 + prandom_u32() % 3));
        else
                err = brcmf_p2p_act_frm_search(p2p, afx_hdl->peer_listen_chan);
 
index a0cb0770d319b9e407ac85aacc88dc7c958b1efb..d3c8ece980d8aa2394439f84348af73ce7ab94ff 100644 (file)
@@ -216,7 +216,7 @@ mwifiex_cfg80211_mgmt_tx(struct wiphy *wiphy, struct wireless_dev *wdev,
        mwifiex_form_mgmt_frame(skb, buf, len);
        mwifiex_queue_tx_pkt(priv, skb);
 
-       *cookie = random32() | 1;
+       *cookie = prandom_u32() | 1;
        cfg80211_mgmt_tx_status(wdev, *cookie, buf, len, true, GFP_ATOMIC);
 
        wiphy_dbg(wiphy, "info: management frame transmitted\n");
@@ -271,7 +271,7 @@ mwifiex_cfg80211_remain_on_channel(struct wiphy *wiphy,
                                         duration);
 
        if (!ret) {
-               *cookie = random32() | 1;
+               *cookie = prandom_u32() | 1;
                priv->roc_cfg.cookie = *cookie;
                priv->roc_cfg.chan = *chan;
 
index 23049aeca662c7a1944daa127a2fc77c3dcd4410..d5a57a9e329c85de3e48722e8b9576f56dcea577 100644 (file)
@@ -84,13 +84,10 @@ int of_mdiobus_register(struct mii_bus *mdio, struct device_node *np)
                phy = get_phy_device(mdio, addr, is_c45);
 
                if (!phy || IS_ERR(phy)) {
-                       phy = phy_device_create(mdio, addr, 0, false, NULL);
-                       if (!phy || IS_ERR(phy)) {
-                               dev_err(&mdio->dev,
-                                       "error creating PHY at address %i\n",
-                                       addr);
-                               continue;
-                       }
+                       dev_err(&mdio->dev,
+                               "cannot get PHY at address %i\n",
+                               addr);
+                       continue;
                }
 
                /* Associate the OF node with the device structure so it
index 42cfcd9eb9aaeb3e8239e646a2634c4b23c54003..1ff1b67e8b274cc6869aab822694a1859954e132 100644 (file)
@@ -575,7 +575,7 @@ sba_io_pdir_entry(u64 *pdir_ptr, space_t sid, unsigned long vba,
 
        mtsp(sid,1);
        asm("lci 0(%%sr1, %1), %0" : "=r" (ci) : "r" (vba));
-       pa |= (ci >> 12) & 0xff;  /* move CI (8 bits) into lowest byte */
+       pa |= (ci >> PAGE_SHIFT) & 0xff;  /* move CI (8 bits) into lowest byte */
 
        pa |= SBA_PDIR_VALID_BIT;       /* set "valid" bit */
        *pdir_ptr = cpu_to_le64(pa);    /* swap and store into I/O Pdir */
@@ -1376,7 +1376,7 @@ static void
 sba_ioc_init(struct parisc_device *sba, struct ioc *ioc, int ioc_num)
 {
        u32 iova_space_size, iova_space_mask;
-       unsigned int pdir_size, iov_order;
+       unsigned int pdir_size, iov_order, tcnfg;
 
        /*
        ** Determine IOVA Space size from memory size.
@@ -1468,8 +1468,19 @@ sba_ioc_init(struct parisc_device *sba, struct ioc *ioc, int ioc_num)
        WRITE_REG(ioc->ibase | 1, ioc->ioc_hpa+IOC_IBASE);
        WRITE_REG(ioc->imask, ioc->ioc_hpa+IOC_IMASK);
 
-       /* Set I/O PDIR Page size to 4K */
-       WRITE_REG(0, ioc->ioc_hpa+IOC_TCNFG);
+       /* Set I/O PDIR Page size to system page size */
+       switch (PAGE_SHIFT) {
+               case 12: tcnfg = 0; break;      /*  4K */
+               case 13: tcnfg = 1; break;      /*  8K */
+               case 14: tcnfg = 2; break;      /* 16K */
+               case 16: tcnfg = 3; break;      /* 64K */
+               default:
+                       panic(__FILE__ ": unsupported system page size %d",
+                               1 << PAGE_SHIFT);
+                       break;
+       }
+       /* Set I/O PDIR Page size to PAGE_SIZE (4k/16k/...) */
+       WRITE_REG(tcnfg, ioc->ioc_hpa+IOC_TCNFG);
 
        /*
        ** Clear I/O TLB of any possible entries.
index 748f8f3e9ff52a4634e277452a2d7b74ac72f999..32e66a6f12d98084f77f493ed1d202084ad3329e 100644 (file)
@@ -174,6 +174,7 @@ int pci_bus_add_device(struct pci_dev *dev)
         * Can not put in pci_device_add yet because resources
         * are not assigned yet for some devices.
         */
+       pci_fixup_device(pci_fixup_final, dev);
        pci_create_sysfs_dev_files(dev);
 
        dev->match_driver = true;
index d40bed7267695fba2ed344cf1d28de56eac8659a..2c1075213beceac6bb0eb75be6aeef7adc8f34d7 100644 (file)
@@ -563,8 +563,10 @@ static int msi_capability_init(struct pci_dev *dev, int nvec)
        entry->msi_attrib.default_irq   = dev->irq;     /* Save IOAPIC IRQ */
        entry->msi_attrib.pos           = dev->msi_cap;
 
-       entry->mask_pos = dev->msi_cap + (control & PCI_MSI_FLAGS_64BIT) ?
-               PCI_MSI_MASK_64 : PCI_MSI_MASK_32;
+       if (control & PCI_MSI_FLAGS_64BIT)
+               entry->mask_pos = dev->msi_cap + PCI_MSI_MASK_64;
+       else
+               entry->mask_pos = dev->msi_cap + PCI_MSI_MASK_32;
        /* All MSIs are unmasked by default, Mask them all */
        if (entry->msi_attrib.maskbit)
                pci_read_config_dword(dev, entry->mask_pos, &entry->masked);
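The original expression was an operator-precedence bug: '+' binds more tightly than '?:', so "dev->msi_cap + (control & PCI_MSI_FLAGS_64BIT)" became the ternary condition (always true, since msi_cap is non-zero) and mask_pos ended up as bare PCI_MSI_MASK_64 with no capability offset. The if/else above is the chosen fix; an equivalent one-liner would simply parenthesize the conditional, for example:

	/* equivalent to the if/else in the hunk above (illustrative only) */
	entry->mask_pos = dev->msi_cap +
			  ((control & PCI_MSI_FLAGS_64BIT) ?
			   PCI_MSI_MASK_64 : PCI_MSI_MASK_32);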
index 631aeb7d2d2dd548d7db2ccfeae6b7518f0fc483..70f10fa3c1b216ab88e04471c171d8404f107770 100644 (file)
@@ -1341,7 +1341,6 @@ void pci_device_add(struct pci_dev *dev, struct pci_bus *bus)
        list_add_tail(&dev->bus_list, &bus->devices);
        up_write(&pci_bus_sem);
 
-       pci_fixup_device(pci_fixup_final, dev);
        ret = pcibios_add_device(dev);
        WARN_ON(ret < 0);
 
index 0e1f99c33d47d3cfcb6677ee4736cbe37edf80d4..f8a2ae413c7f1366901d66d1570eb2de87d3ce60 100644 (file)
@@ -6,7 +6,7 @@ if ARCH_SHMOBILE || SUPERH
 
 config PINCTRL_SH_PFC
        # XXX move off the gpio dependency
-       depends on GENERIC_GPIO
+       depends on GPIOLIB
        select GPIO_SH_PFC if ARCH_REQUIRE_GPIOLIB
        select PINMUX
        select PINCONF
@@ -40,19 +40,19 @@ config PINCTRL_PFC_R8A7779
 config PINCTRL_PFC_SH7203
        def_bool y
        depends on CPU_SUBTYPE_SH7203
-       depends on GENERIC_GPIO
+       depends on GPIOLIB
        select PINCTRL_SH_PFC
 
 config PINCTRL_PFC_SH7264
        def_bool y
        depends on CPU_SUBTYPE_SH7264
-       depends on GENERIC_GPIO
+       depends on GPIOLIB
        select PINCTRL_SH_PFC
 
 config PINCTRL_PFC_SH7269
        def_bool y
        depends on CPU_SUBTYPE_SH7269
-       depends on GENERIC_GPIO
+       depends on GPIOLIB
        select PINCTRL_SH_PFC
 
 config PINCTRL_PFC_SH7372
@@ -68,55 +68,55 @@ config PINCTRL_PFC_SH73A0
 config PINCTRL_PFC_SH7720
        def_bool y
        depends on CPU_SUBTYPE_SH7720
-       depends on GENERIC_GPIO
+       depends on GPIOLIB
        select PINCTRL_SH_PFC
 
 config PINCTRL_PFC_SH7722
        def_bool y
        depends on CPU_SUBTYPE_SH7722
-       depends on GENERIC_GPIO
+       depends on GPIOLIB
        select PINCTRL_SH_PFC
 
 config PINCTRL_PFC_SH7723
        def_bool y
        depends on CPU_SUBTYPE_SH7723
-       depends on GENERIC_GPIO
+       depends on GPIOLIB
        select PINCTRL_SH_PFC
 
 config PINCTRL_PFC_SH7724
        def_bool y
        depends on CPU_SUBTYPE_SH7724
-       depends on GENERIC_GPIO
+       depends on GPIOLIB
        select PINCTRL_SH_PFC
 
 config PINCTRL_PFC_SH7734
        def_bool y
        depends on CPU_SUBTYPE_SH7734
-       depends on GENERIC_GPIO
+       depends on GPIOLIB
        select PINCTRL_SH_PFC
 
 config PINCTRL_PFC_SH7757
        def_bool y
        depends on CPU_SUBTYPE_SH7757
-       depends on GENERIC_GPIO
+       depends on GPIOLIB
        select PINCTRL_SH_PFC
 
 config PINCTRL_PFC_SH7785
        def_bool y
        depends on CPU_SUBTYPE_SH7785
-       depends on GENERIC_GPIO
+       depends on GPIOLIB
        select PINCTRL_SH_PFC
 
 config PINCTRL_PFC_SH7786
        def_bool y
        depends on CPU_SUBTYPE_SH7786
-       depends on GENERIC_GPIO
+       depends on GPIOLIB
        select PINCTRL_SH_PFC
 
 config PINCTRL_PFC_SHX3
        def_bool y
        depends on CPU_SUBTYPE_SHX3
-       depends on GENERIC_GPIO
+       depends on GPIOLIB
        select PINCTRL_SH_PFC
 
 endif
index a5d97eaee99e3085ad5dd8c5c5d294511b374c5f..8bb26446037e297ff974200bf3c81e993ff62a47 100644 (file)
@@ -66,7 +66,7 @@ config REGULATOR_USERSPACE_CONSUMER
 
 config REGULATOR_GPIO
        tristate "GPIO regulator support"
-       depends on GENERIC_GPIO
+       depends on GPIOLIB
        help
          This driver provides support for regulators that can be
          controlled via gpios.
index c6d77e20622c45f54e17e7063e3e498f781739c6..d4d377c40ec96585126f474238040d47c9e79c2e 100644 (file)
@@ -4,13 +4,15 @@ menu "Remoteproc drivers"
 config REMOTEPROC
        tristate
        depends on HAS_DMA
+       select CRC32
        select FW_LOADER
        select VIRTIO
+       select VIRTUALIZATION
 
 config OMAP_REMOTEPROC
        tristate "OMAP remoteproc support"
        depends on HAS_DMA
-       depends on ARCH_OMAP4
+       depends on ARCH_OMAP4 || SOC_OMAP5
        depends on OMAP_IOMMU
        depends on OMAP_MBOX_FWK
        select REMOTEPROC
@@ -38,4 +40,27 @@ config STE_MODEM_RPROC
          This can be either built-in or a loadable module.
          If unsure say N.
 
+config DA8XX_REMOTEPROC
+       tristate "DA8xx/OMAP-L13x remoteproc support"
+       depends on ARCH_DAVINCI_DA8XX
+       select CMA
+       select REMOTEPROC
+       select RPMSG
+       help
+         Say y here to support DA8xx/OMAP-L13x remote processors via the
+         remote processor framework.
+
+         You want to say y here in order to enable AMP
+         use-cases to run on your platform (multimedia codecs are
+         offloaded to remote DSP processors using this framework).
+
+         This module controls the name of the firmware file that gets
+         loaded on the DSP.  This file must reside in the /lib/firmware
+         directory.  It can be specified via the module parameter
+         da8xx_fw_name=<filename>, and if not specified will default to
+         "rproc-dsp-fw".
+
+         It's safe to say n here if you're not interested in multimedia
+         offloading.
+
 endmenu
index 391b65181c054c795a8e0e34b1092f4ddba969f4..ac2ff75686d20da708d77728aca94f23cbdf1764 100644 (file)
@@ -9,3 +9,4 @@ remoteproc-y                            += remoteproc_virtio.o
 remoteproc-y                           += remoteproc_elf_loader.o
 obj-$(CONFIG_OMAP_REMOTEPROC)          += omap_remoteproc.o
 obj-$(CONFIG_STE_MODEM_RPROC)          += ste_modem_rproc.o
+obj-$(CONFIG_DA8XX_REMOTEPROC)         += da8xx_remoteproc.o
diff --git a/drivers/remoteproc/da8xx_remoteproc.c b/drivers/remoteproc/da8xx_remoteproc.c
new file mode 100644 (file)
index 0000000..9b2e60a
--- /dev/null
@@ -0,0 +1,324 @@
+/*
+ * Remote processor machine-specific module for DA8XX
+ *
+ * Copyright (C) 2013 Texas Instruments, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ */
+
+#include <linux/bitops.h>
+#include <linux/clk.h>
+#include <linux/err.h>
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/irq.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/platform_device.h>
+#include <linux/remoteproc.h>
+
+#include <mach/clock.h>   /* for davinci_clk_reset_assert/deassert() */
+
+#include "remoteproc_internal.h"
+
+static char *da8xx_fw_name;
+module_param(da8xx_fw_name, charp, S_IRUGO);
+MODULE_PARM_DESC(da8xx_fw_name,
+                "\n\t\tName of DSP firmware file in /lib/firmware"
+                " (if not specified defaults to 'rproc-dsp-fw')");
+
+/*
+ * OMAP-L138 Technical References:
+ * http://www.ti.com/product/omap-l138
+ */
+#define SYSCFG_CHIPSIG0 BIT(0)
+#define SYSCFG_CHIPSIG1 BIT(1)
+#define SYSCFG_CHIPSIG2 BIT(2)
+#define SYSCFG_CHIPSIG3 BIT(3)
+#define SYSCFG_CHIPSIG4 BIT(4)
+
+/**
+ * struct da8xx_rproc - da8xx remote processor instance state
+ * @rproc: rproc handle
+ * @dsp_clk: placeholder for platform's DSP clk
+ * @ack_fxn: chip-specific ack function for ack'ing irq
+ * @irq_data: ack_fxn function parameter
+ * @chipsig: virt ptr to DSP interrupt registers (CHIPSIG & CHIPSIG_CLR)
+ * @bootreg: virt ptr to DSP boot address register (HOST1CFG)
+ * @irq: irq # used by this instance
+ */
+struct da8xx_rproc {
+       struct rproc *rproc;
+       struct clk *dsp_clk;
+       void (*ack_fxn)(struct irq_data *data);
+       struct irq_data *irq_data;
+       void __iomem *chipsig;
+       void __iomem *bootreg;
+       int irq;
+};
+
+/**
+ * handle_event() - threaded IRQ handler for inbound virtqueue messages
+ *
+ * This function runs as the threaded half of the IRQ; the kernel schedules
+ * it after da8xx_rproc_callback() returns IRQ_WAKE_THREAD.
+ */
+static irqreturn_t handle_event(int irq, void *p)
+{
+       struct rproc *rproc = (struct rproc *)p;
+
+       /* Process incoming buffers on all our vrings */
+       rproc_vq_interrupt(rproc, 0);
+       rproc_vq_interrupt(rproc, 1);
+
+       return IRQ_HANDLED;
+}
+
+/**
+ * da8xx_rproc_callback() - inbound virtqueue message handler
+ *
+ * This handler is invoked directly by the kernel whenever the remote
+ * core (DSP) has modified the state of a virtqueue.  There is no
+ * "payload" message indicating the virtqueue index as is the case with
+ * mailbox-based implementations on OMAP4.  As such, this handler "polls"
+ * each known virtqueue index for every invocation.
+ */
+static irqreturn_t da8xx_rproc_callback(int irq, void *p)
+{
+       struct rproc *rproc = (struct rproc *)p;
+       struct da8xx_rproc *drproc = (struct da8xx_rproc *)rproc->priv;
+       u32 chipsig;
+
+       chipsig = readl(drproc->chipsig);
+       if (chipsig & SYSCFG_CHIPSIG0) {
+               /* Clear interrupt level source */
+               writel(SYSCFG_CHIPSIG0, drproc->chipsig + 4);
+
+               /*
+                * ACK intr to AINTC.
+                *
+                * It has already been ack'ed by the kernel before calling
+                * this function, but since the ARM<->DSP interrupts in the
+                * CHIPSIG register are "level" instead of "pulse" variety,
+                * we need to ack it after taking down the level else we'll
+                * be called again immediately after returning.
+                */
+               drproc->ack_fxn(drproc->irq_data);
+
+               return IRQ_WAKE_THREAD;
+       }
+
+       return IRQ_HANDLED;
+}
+
+static int da8xx_rproc_start(struct rproc *rproc)
+{
+       struct device *dev = rproc->dev.parent;
+       struct da8xx_rproc *drproc = (struct da8xx_rproc *)rproc->priv;
+       struct clk *dsp_clk = drproc->dsp_clk;
+
+       /* hw requires the start (boot) address to be on a 1KB boundary */
+       if (rproc->bootaddr & 0x3ff) {
+               dev_err(dev, "invalid boot address: must be aligned to 1KB\n");
+
+               return -EINVAL;
+       }
+
+       writel(rproc->bootaddr, drproc->bootreg);
+
+       clk_enable(dsp_clk);
+       davinci_clk_reset_deassert(dsp_clk);
+
+       return 0;
+}
+
+static int da8xx_rproc_stop(struct rproc *rproc)
+{
+       struct da8xx_rproc *drproc = rproc->priv;
+
+       clk_disable(drproc->dsp_clk);
+
+       return 0;
+}
+
+/* kick a virtqueue */
+static void da8xx_rproc_kick(struct rproc *rproc, int vqid)
+{
+       struct da8xx_rproc *drproc = (struct da8xx_rproc *)rproc->priv;
+
+       /* Interrupt remote proc */
+       writel(SYSCFG_CHIPSIG2, drproc->chipsig);
+}
+
+static struct rproc_ops da8xx_rproc_ops = {
+       .start = da8xx_rproc_start,
+       .stop = da8xx_rproc_stop,
+       .kick = da8xx_rproc_kick,
+};
+
+static int reset_assert(struct device *dev)
+{
+       struct clk *dsp_clk;
+
+       dsp_clk = clk_get(dev, NULL);
+       if (IS_ERR(dsp_clk)) {
+               dev_err(dev, "clk_get error: %ld\n", PTR_ERR(dsp_clk));
+               return PTR_RET(dsp_clk);
+       }
+
+       davinci_clk_reset_assert(dsp_clk);
+       clk_put(dsp_clk);
+
+       return 0;
+}
+
+static int da8xx_rproc_probe(struct platform_device *pdev)
+{
+       struct device *dev = &pdev->dev;
+       struct da8xx_rproc *drproc;
+       struct rproc *rproc;
+       struct irq_data *irq_data;
+       struct resource *bootreg_res;
+       struct resource *chipsig_res;
+       struct clk *dsp_clk;
+       void __iomem *chipsig;
+       void __iomem *bootreg;
+       int irq;
+       int ret;
+
+       irq = platform_get_irq(pdev, 0);
+       if (irq < 0) {
+               dev_err(dev, "platform_get_irq(pdev, 0) error: %d\n", irq);
+               return irq;
+       }
+
+       irq_data = irq_get_irq_data(irq);
+       if (!irq_data) {
+               dev_err(dev, "irq_get_irq_data(%d): NULL\n", irq);
+               return -EINVAL;
+       }
+
+       bootreg_res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+       if (!bootreg_res) {
+               dev_err(dev,
+                       "platform_get_resource(IORESOURCE_MEM, 0): NULL\n");
+               return -EADDRNOTAVAIL;
+       }
+
+       chipsig_res = platform_get_resource(pdev, IORESOURCE_MEM, 1);
+       if (!chipsig_res) {
+               dev_err(dev,
+                       "platform_get_resource(IORESOURCE_MEM, 1): NULL\n");
+               return -EADDRNOTAVAIL;
+       }
+
+       bootreg = devm_ioremap_resource(dev, bootreg_res);
+       if (IS_ERR(bootreg))
+               return PTR_ERR(bootreg);
+
+       chipsig = devm_ioremap_resource(dev, chipsig_res);
+       if (IS_ERR(chipsig))
+               return PTR_ERR(chipsig);
+
+       dsp_clk = devm_clk_get(dev, NULL);
+       if (IS_ERR(dsp_clk)) {
+               dev_err(dev, "clk_get error: %ld\n", PTR_ERR(dsp_clk));
+
+               return PTR_ERR(dsp_clk);
+       }
+
+       rproc = rproc_alloc(dev, "dsp", &da8xx_rproc_ops, da8xx_fw_name,
+               sizeof(*drproc));
+       if (!rproc)
+               return -ENOMEM;
+
+       drproc = rproc->priv;
+       drproc->rproc = rproc;
+
+       platform_set_drvdata(pdev, rproc);
+
+       /* everything the ISR needs is now set up, so hook it up */
+       ret = devm_request_threaded_irq(dev, irq, da8xx_rproc_callback,
+                                       handle_event, 0, "da8xx-remoteproc",
+                                       rproc);
+       if (ret) {
+               dev_err(dev, "devm_request_threaded_irq error: %d\n", ret);
+               goto free_rproc;
+       }
+
+       /*
+        * rproc_add() can end up enabling the DSP's clk with the DSP
+        * *not* in reset, but da8xx_rproc_start() needs the DSP to be
+        * held in reset at the time it is called.
+        */
+       ret = reset_assert(dev);
+       if (ret)
+               goto free_rproc;
+
+       drproc->chipsig = chipsig;
+       drproc->bootreg = bootreg;
+       drproc->ack_fxn = irq_data->chip->irq_ack;
+       drproc->irq_data = irq_data;
+       drproc->irq = irq;
+       drproc->dsp_clk = dsp_clk;
+
+       ret = rproc_add(rproc);
+       if (ret) {
+               dev_err(dev, "rproc_add failed: %d\n", ret);
+               goto free_rproc;
+       }
+
+       return 0;
+
+free_rproc:
+       rproc_put(rproc);
+
+       return ret;
+}
+
+static int da8xx_rproc_remove(struct platform_device *pdev)
+{
+       struct device *dev = &pdev->dev;
+       struct rproc *rproc = platform_get_drvdata(pdev);
+       struct da8xx_rproc *drproc = (struct da8xx_rproc *)rproc->priv;
+
+       /*
+        * It's important to place the DSP in reset before going away,
+        * since a subsequent insmod of this module may enable the DSP's
+        * clock before its program/boot-address has been loaded and
+        * before this module's probe has had a chance to reset the DSP.
+        * Without the reset, the DSP can lockup permanently when it
+        * begins executing garbage.
+        */
+       reset_assert(dev);
+
+       /*
+        * The devm subsystem might end up releasing things before
+        * freeing the irq, thus allowing an interrupt to sneak in while
+        * the device is being removed.  This should prevent that.
+        */
+       disable_irq(drproc->irq);
+
+       devm_clk_put(dev, drproc->dsp_clk);
+
+       rproc_del(rproc);
+       rproc_put(rproc);
+
+       return 0;
+}
+
+static struct platform_driver da8xx_rproc_driver = {
+       .probe = da8xx_rproc_probe,
+       .remove = da8xx_rproc_remove,
+       .driver = {
+               .name = "davinci-rproc",
+               .owner = THIS_MODULE,
+       },
+};
+
+module_platform_driver(da8xx_rproc_driver);
+
+MODULE_LICENSE("GPL v2");
+MODULE_DESCRIPTION("DA8XX Remote Processor control driver");
index 814af5ab8a72d2d5e979855bbffc86b487fe41f6..022dc635d01e4935ee84c0378e0d606e94028a39 100644 (file)
@@ -37,6 +37,7 @@
 #include <linux/iommu.h>
 #include <linux/idr.h>
 #include <linux/elf.h>
+#include <linux/crc32.h>
 #include <linux/virtio_ids.h>
 #include <linux/virtio_ring.h>
 #include <asm/byteorder.h>
@@ -45,7 +46,8 @@
 
 typedef int (*rproc_handle_resources_t)(struct rproc *rproc,
                                struct resource_table *table, int len);
-typedef int (*rproc_handle_resource_t)(struct rproc *rproc, void *, int avail);
+typedef int (*rproc_handle_resource_t)(struct rproc *rproc,
+                                void *, int offset, int avail);
 
 /* Unique indices for remoteproc devices */
 static DEFINE_IDA(rproc_dev_index);
@@ -192,6 +194,7 @@ int rproc_alloc_vring(struct rproc_vdev *rvdev, int i)
        struct rproc *rproc = rvdev->rproc;
        struct device *dev = &rproc->dev;
        struct rproc_vring *rvring = &rvdev->vring[i];
+       struct fw_rsc_vdev *rsc;
        dma_addr_t dma;
        void *va;
        int ret, size, notifyid;
@@ -202,7 +205,6 @@ int rproc_alloc_vring(struct rproc_vdev *rvdev, int i)
        /*
         * Allocate non-cacheable memory for the vring. In the future
         * this call will also configure the IOMMU for us
-        * TODO: let the rproc know the da of this vring
         */
        va = dma_alloc_coherent(dev->parent, size, &dma, GFP_KERNEL);
        if (!va) {
@@ -213,7 +215,6 @@ int rproc_alloc_vring(struct rproc_vdev *rvdev, int i)
        /*
         * Assign an rproc-wide unique index for this vring
         * TODO: assign a notifyid for rvdev updates as well
-        * TODO: let the rproc know the notifyid of this vring
         * TODO: support predefined notifyids (via resource table)
         */
        ret = idr_alloc(&rproc->notifyids, rvring, 0, 0, GFP_KERNEL);
@@ -224,9 +225,6 @@ int rproc_alloc_vring(struct rproc_vdev *rvdev, int i)
        }
        notifyid = ret;
 
-       /* Store largest notifyid */
-       rproc->max_notifyid = max(rproc->max_notifyid, notifyid);
-
        dev_dbg(dev, "vring%d: va %p dma %llx size %x idr %d\n", i, va,
                                (unsigned long long)dma, size, notifyid);
 
@@ -234,6 +232,15 @@ int rproc_alloc_vring(struct rproc_vdev *rvdev, int i)
        rvring->dma = dma;
        rvring->notifyid = notifyid;
 
+       /*
+        * Let the rproc know the notifyid and da of this vring.
+        * Not all platforms use dma_alloc_coherent to automatically
+        * set up the iommu. In this case the device address (da) will
+        * hold the physical address and not the device address.
+        */
+       rsc = (void *)rproc->table_ptr + rvdev->rsc_offset;
+       rsc->vring[i].da = dma;
+       rsc->vring[i].notifyid = notifyid;
        return 0;
 }
 
@@ -268,25 +275,20 @@ rproc_parse_vring(struct rproc_vdev *rvdev, struct fw_rsc_vdev *rsc, int i)
        return 0;
 }
 
-static int rproc_max_notifyid(int id, void *p, void *data)
-{
-       int *maxid = data;
-       *maxid = max(*maxid, id);
-       return 0;
-}
-
 void rproc_free_vring(struct rproc_vring *rvring)
 {
        int size = PAGE_ALIGN(vring_size(rvring->len, rvring->align));
        struct rproc *rproc = rvring->rvdev->rproc;
-       int maxid = 0;
+       int idx = rvring - rvring->rvdev->vring;
+       struct fw_rsc_vdev *rsc;
 
        dma_free_coherent(rproc->dev.parent, size, rvring->va, rvring->dma);
        idr_remove(&rproc->notifyids, rvring->notifyid);
 
-       /* Find the largest remaining notifyid */
-       idr_for_each(&rproc->notifyids, rproc_max_notifyid, &maxid);
-       rproc->max_notifyid = maxid;
+       /* reset resource entry info */
+       rsc = (void *)rproc->table_ptr + rvring->rvdev->rsc_offset;
+       rsc->vring[idx].da = 0;
+       rsc->vring[idx].notifyid = -1;
 }
 
 /**
@@ -317,7 +319,7 @@ void rproc_free_vring(struct rproc_vring *rvring)
  * Returns 0 on success, or an appropriate error code otherwise
  */
 static int rproc_handle_vdev(struct rproc *rproc, struct fw_rsc_vdev *rsc,
-                                                               int avail)
+                                                       int offset, int avail)
 {
        struct device *dev = &rproc->dev;
        struct rproc_vdev *rvdev;
@@ -358,8 +360,8 @@ static int rproc_handle_vdev(struct rproc *rproc, struct fw_rsc_vdev *rsc,
                        goto free_rvdev;
        }
 
-       /* remember the device features */
-       rvdev->dfeatures = rsc->dfeatures;
+       /* remember the resource offset */
+       rvdev->rsc_offset = offset;
 
        list_add_tail(&rvdev->node, &rproc->rvdevs);
 
@@ -394,7 +396,7 @@ free_rvdev:
  * Returns 0 on success, or an appropriate error code otherwise
  */
 static int rproc_handle_trace(struct rproc *rproc, struct fw_rsc_trace *rsc,
-                                                               int avail)
+                                                       int offset, int avail)
 {
        struct rproc_mem_entry *trace;
        struct device *dev = &rproc->dev;
@@ -476,7 +478,7 @@ static int rproc_handle_trace(struct rproc *rproc, struct fw_rsc_trace *rsc,
  * are outside those ranges.
  */
 static int rproc_handle_devmem(struct rproc *rproc, struct fw_rsc_devmem *rsc,
-                                                               int avail)
+                                                       int offset, int avail)
 {
        struct rproc_mem_entry *mapping;
        struct device *dev = &rproc->dev;
@@ -549,7 +551,9 @@ out:
  * pressure is important; it may have a substantial impact on performance.
  */
 static int rproc_handle_carveout(struct rproc *rproc,
-                               struct fw_rsc_carveout *rsc, int avail)
+                                               struct fw_rsc_carveout *rsc,
+                                               int offset, int avail)
+
 {
        struct rproc_mem_entry *carveout, *mapping;
        struct device *dev = &rproc->dev;
@@ -671,28 +675,45 @@ free_carv:
        return ret;
 }
 
+static int rproc_count_vrings(struct rproc *rproc, struct fw_rsc_vdev *rsc,
+                             int offset, int avail)
+{
+       /* Summarize the number of notification IDs */
+       rproc->max_notifyid += rsc->num_of_vrings;
+
+       return 0;
+}
+
 /*
  * A lookup table for resource handlers. The indices are defined in
  * enum fw_resource_type.
  */
-static rproc_handle_resource_t rproc_handle_rsc[] = {
+static rproc_handle_resource_t rproc_loading_handlers[RSC_LAST] = {
        [RSC_CARVEOUT] = (rproc_handle_resource_t)rproc_handle_carveout,
        [RSC_DEVMEM] = (rproc_handle_resource_t)rproc_handle_devmem,
        [RSC_TRACE] = (rproc_handle_resource_t)rproc_handle_trace,
        [RSC_VDEV] = NULL, /* VDEVs were handled upon registration */
 };
 
+static rproc_handle_resource_t rproc_vdev_handler[RSC_LAST] = {
+       [RSC_VDEV] = (rproc_handle_resource_t)rproc_handle_vdev,
+};
+
+static rproc_handle_resource_t rproc_count_vrings_handler[RSC_LAST] = {
+       [RSC_VDEV] = (rproc_handle_resource_t)rproc_count_vrings,
+};
+
 /* handle firmware resource entries before booting the remote processor */
-static int
-rproc_handle_boot_rsc(struct rproc *rproc, struct resource_table *table, int len)
+static int rproc_handle_resources(struct rproc *rproc, int len,
+                                 rproc_handle_resource_t handlers[RSC_LAST])
 {
        struct device *dev = &rproc->dev;
        rproc_handle_resource_t handler;
        int ret = 0, i;
 
-       for (i = 0; i < table->num; i++) {
-               int offset = table->offset[i];
-               struct fw_rsc_hdr *hdr = (void *)table + offset;
+       for (i = 0; i < rproc->table_ptr->num; i++) {
+               int offset = rproc->table_ptr->offset[i];
+               struct fw_rsc_hdr *hdr = (void *)rproc->table_ptr + offset;
                int avail = len - offset - sizeof(*hdr);
                void *rsc = (void *)hdr + sizeof(*hdr);
 
@@ -709,45 +730,11 @@ rproc_handle_boot_rsc(struct rproc *rproc, struct resource_table *table, int len
                        continue;
                }
 
-               handler = rproc_handle_rsc[hdr->type];
+               handler = handlers[hdr->type];
                if (!handler)
                        continue;
 
-               ret = handler(rproc, rsc, avail);
-               if (ret)
-                       break;
-       }
-
-       return ret;
-}
-
-/* handle firmware resource entries while registering the remote processor */
-static int
-rproc_handle_virtio_rsc(struct rproc *rproc, struct resource_table *table, int len)
-{
-       struct device *dev = &rproc->dev;
-       int ret = 0, i;
-
-       for (i = 0; i < table->num; i++) {
-               int offset = table->offset[i];
-               struct fw_rsc_hdr *hdr = (void *)table + offset;
-               int avail = len - offset - sizeof(*hdr);
-               struct fw_rsc_vdev *vrsc;
-
-               /* make sure table isn't truncated */
-               if (avail < 0) {
-                       dev_err(dev, "rsc table is truncated\n");
-                       return -EINVAL;
-               }
-
-               dev_dbg(dev, "%s: rsc type %d\n", __func__, hdr->type);
-
-               if (hdr->type != RSC_VDEV)
-                       continue;
-
-               vrsc = (struct fw_rsc_vdev *)hdr->data;
-
-               ret = rproc_handle_vdev(rproc, vrsc, avail);
+               ret = handler(rproc, rsc, offset + sizeof(*hdr), avail);
                if (ret)
                        break;
        }
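The refactor collapses the two nearly identical table walkers into a single rproc_handle_resources() that is parameterized by a handler array indexed by fw_resource_type, so the registration-time (vdev), vring-counting and boot-time passes share the iteration and bounds checks. A compact, purely illustrative sketch of that table-driven dispatch:

	typedef int (*rsc_handler_t)(struct rproc *rproc, void *rsc,
				     int offset, int avail);

	static int dispatch_entry(struct rproc *rproc, struct fw_rsc_hdr *hdr,
				  int offset, int avail,
				  rsc_handler_t handlers[RSC_LAST])
	{
		rsc_handler_t handler;

		if (hdr->type >= RSC_LAST)	/* unknown type: skip, don't fail */
			return 0;

		handler = handlers[hdr->type];
		if (!handler)			/* this pass ignores this type */
			return 0;

		return handler(rproc, (void *)hdr + sizeof(*hdr),
			       offset + sizeof(*hdr), avail);
	}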
@@ -805,9 +792,12 @@ static int rproc_fw_boot(struct rproc *rproc, const struct firmware *fw)
 {
        struct device *dev = &rproc->dev;
        const char *name = rproc->firmware;
-       struct resource_table *table;
+       struct resource_table *table, *loaded_table;
        int ret, tablesz;
 
+       if (!rproc->table_ptr)
+               return -ENOMEM;
+
        ret = rproc_fw_sanity_check(rproc, fw);
        if (ret)
                return ret;
@@ -833,8 +823,15 @@ static int rproc_fw_boot(struct rproc *rproc, const struct firmware *fw)
                goto clean_up;
        }
 
+       /* Verify that resource table in loaded fw is unchanged */
+       if (rproc->table_csum != crc32(0, table, tablesz)) {
+               dev_err(dev, "resource checksum failed, fw changed?\n");
+               ret = -EINVAL;
+               goto clean_up;
+       }
+
        /* handle fw resources which are required to boot rproc */
-       ret = rproc_handle_boot_rsc(rproc, table, tablesz);
+       ret = rproc_handle_resources(rproc, tablesz, rproc_loading_handlers);
        if (ret) {
                dev_err(dev, "Failed to process resources: %d\n", ret);
                goto clean_up;
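The checksum recorded at registration time (see the crc32() call in rproc_fw_config_virtio() further down) is recomputed here before boot, so a firmware file swapped on disk between probe and start is rejected instead of being booted against a stale cached resource table. A small hedged sketch of the check, assuming <linux/crc32.h> (pulled in via the 'select CRC32' added in the Kconfig hunk above):

	#include <linux/crc32.h>

	/* true when the table in the newly requested firmware still matches
	 * the checksum recorded when the rproc was first registered */
	static bool rsc_table_unchanged(const void *table, int len, u32 recorded)
	{
		return crc32(0, table, len) == recorded;
	}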
@@ -847,6 +844,19 @@ static int rproc_fw_boot(struct rproc *rproc, const struct firmware *fw)
                goto clean_up;
        }
 
+       /*
+        * The starting device has been given the rproc->cached_table as the
+        * resource table. The address of the vring along with the other
+        * allocated resources (carveouts etc) is stored in cached_table.
+        * In order to pass this information to the remote device we must
+        * copy this information to device memory.
+        */
+       loaded_table = rproc_find_loaded_rsc_table(rproc, fw);
+       if (!loaded_table)
+               goto clean_up;
+
+       memcpy(loaded_table, rproc->cached_table, tablesz);
+
        /* power up the remote processor */
        ret = rproc->ops->start(rproc);
        if (ret) {
@@ -854,6 +864,13 @@ static int rproc_fw_boot(struct rproc *rproc, const struct firmware *fw)
                goto clean_up;
        }
 
+       /*
+        * Update table_ptr so that all subsequent vring allocations and
+        * virtio fields manipulation update the actual loaded resource table
+        * in device memory.
+        */
+       rproc->table_ptr = loaded_table;
+
        rproc->state = RPROC_RUNNING;
 
        dev_info(dev, "remote processor %s is now up\n", rproc->name);
@@ -888,11 +905,30 @@ static void rproc_fw_config_virtio(const struct firmware *fw, void *context)
        if (!table)
                goto out;
 
-       /* look for virtio devices and register them */
-       ret = rproc_handle_virtio_rsc(rproc, table, tablesz);
+       rproc->table_csum = crc32(0, table, tablesz);
+
+       /*
+        * Create a copy of the resource table. When a virtio device starts
+        * and calls vring_new_virtqueue() the address of the allocated vring
+        * will be stored in the cached_table. Before the device is started,
+        * cached_table will be copied into device memory.
+        */
+       rproc->cached_table = kmalloc(tablesz, GFP_KERNEL);
+       if (!rproc->cached_table)
+               goto out;
+
+       memcpy(rproc->cached_table, table, tablesz);
+       rproc->table_ptr = rproc->cached_table;
+
+       /* count the number of notify-ids */
+       rproc->max_notifyid = -1;
+       ret = rproc_handle_resources(rproc, tablesz, rproc_count_vrings_handler);
        if (ret)
                goto out;
 
+       /* look for virtio devices and register them */
+       ret = rproc_handle_resources(rproc, tablesz, rproc_vdev_handler);
+
 out:
        release_firmware(fw);
        /* allow rproc_del() contexts, if any, to proceed */
@@ -950,6 +986,9 @@ int rproc_trigger_recovery(struct rproc *rproc)
        /* wait until there is no more rproc users */
        wait_for_completion(&rproc->crash_comp);
 
+       /* Free the copy of the resource table */
+       kfree(rproc->cached_table);
+
        return rproc_add_virtio_devices(rproc);
 }
 
@@ -1105,6 +1144,9 @@ void rproc_shutdown(struct rproc *rproc)
 
        rproc_disable_iommu(rproc);
 
+       /* Give the next start a clean resource table */
+       rproc->table_ptr = rproc->cached_table;
+
        /* if in crash state, unlock crash handler */
        if (rproc->state == RPROC_CRASHED)
                complete_all(&rproc->crash_comp);
@@ -1196,11 +1238,11 @@ static struct device_type rproc_type = {
  * @dev: the underlying device
  * @name: name of this remote processor
  * @ops: platform-specific handlers (mainly start/stop)
- * @firmware: name of firmware file to load
+ * @firmware: name of firmware file to load, can be NULL
  * @len: length of private data needed by the rproc driver (in bytes)
  *
  * Allocates a new remote processor handle, but does not register
- * it yet.
+ * it yet. If @firmware is NULL, a default name is used.
  *
  * This function should be used by rproc implementations during initialization
  * of the remote processor.
@@ -1219,19 +1261,39 @@ struct rproc *rproc_alloc(struct device *dev, const char *name,
                                const char *firmware, int len)
 {
        struct rproc *rproc;
+       char *p, *template = "rproc-%s-fw";
+       int name_len = 0;
 
        if (!dev || !name || !ops)
                return NULL;
 
-       rproc = kzalloc(sizeof(struct rproc) + len, GFP_KERNEL);
+       if (!firmware)
+               /*
+                * Make room for default firmware name (minus %s plus '\0').
+                * If the caller didn't pass in a firmware name then
+                * construct a default name.  We're already glomming 'len'
+                * bytes onto the end of the struct rproc allocation, so do
+                * a few more for the default firmware name (but only if
+                * the caller doesn't pass one).
+                */
+               name_len = strlen(name) + strlen(template) - 2 + 1;
+
+       rproc = kzalloc(sizeof(struct rproc) + len + name_len, GFP_KERNEL);
        if (!rproc) {
                dev_err(dev, "%s: kzalloc failed\n", __func__);
                return NULL;
        }
 
+       if (!firmware) {
+               p = (char *)rproc + sizeof(struct rproc) + len;
+               snprintf(p, name_len, template, name);
+       } else {
+               p = (char *)firmware;
+       }
+
+       rproc->firmware = p;
        rproc->name = name;
        rproc->ops = ops;
-       rproc->firmware = firmware;
        rproc->priv = &rproc[1];
 
        device_initialize(&rproc->dev);
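With this change a platform driver may pass a NULL firmware name and get a conventional default derived from the rproc name via the "rproc-%s-fw" template, stored in the same allocation as the struct rproc. A hedged usage sketch (my_ops and my_priv are hypothetical):

	/* firmware == NULL: rproc->firmware becomes "rproc-dsp-fw" */
	rproc = rproc_alloc(&pdev->dev, "dsp", &my_ops, NULL,
			    sizeof(struct my_priv));
	if (!rproc)
		return -ENOMEM;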
@@ -1315,6 +1377,9 @@ int rproc_del(struct rproc *rproc)
        list_for_each_entry_safe(rvdev, tmp, &rproc->rvdevs, node)
                rproc_remove_virtio_dev(rvdev);
 
+       /* Free the copy of the resource table */
+       kfree(rproc->cached_table);
+
        device_del(&rproc->dev);
 
        return 0;
index 0d36f94ab51defdd1adea3c1d42b58366ac02f21..ce283a5b42a1e2b677133b9ee2d6284fb80625a7 100644 (file)
@@ -208,41 +208,22 @@ rproc_elf_load_segments(struct rproc *rproc, const struct firmware *fw)
        return ret;
 }
 
-/**
- * rproc_elf_find_rsc_table() - find the resource table
- * @rproc: the rproc handle
- * @fw: the ELF firmware image
- * @tablesz: place holder for providing back the table size
- *
- * This function finds the resource table inside the remote processor's
- * firmware. It is used both upon the registration of @rproc (in order
- * to look for and register the supported virito devices), and when the
- * @rproc is booted.
- *
- * Returns the pointer to the resource table if it is found, and write its
- * size into @tablesz. If a valid table isn't found, NULL is returned
- * (and @tablesz isn't set).
- */
-static struct resource_table *
-rproc_elf_find_rsc_table(struct rproc *rproc, const struct firmware *fw,
-                                                       int *tablesz)
+static struct elf32_shdr *
+find_table(struct device *dev, struct elf32_hdr *ehdr, size_t fw_size)
 {
-       struct elf32_hdr *ehdr;
        struct elf32_shdr *shdr;
+       int i;
        const char *name_table;
-       struct device *dev = &rproc->dev;
        struct resource_table *table = NULL;
-       int i;
-       const u8 *elf_data = fw->data;
+       const u8 *elf_data = (void *)ehdr;
 
-       ehdr = (struct elf32_hdr *)elf_data;
+       /* look for the resource table and handle it */
        shdr = (struct elf32_shdr *)(elf_data + ehdr->e_shoff);
        name_table = elf_data + shdr[ehdr->e_shstrndx].sh_offset;
 
-       /* look for the resource table and handle it */
        for (i = 0; i < ehdr->e_shnum; i++, shdr++) {
-               int size = shdr->sh_size;
-               int offset = shdr->sh_offset;
+               u32 size = shdr->sh_size;
+               u32 offset = shdr->sh_offset;
 
                if (strcmp(name_table + shdr->sh_name, ".resource_table"))
                        continue;
@@ -250,7 +231,7 @@ rproc_elf_find_rsc_table(struct rproc *rproc, const struct firmware *fw,
                table = (struct resource_table *)(elf_data + offset);
 
                /* make sure we have the entire table */
-               if (offset + size > fw->size) {
+               if (offset + size > fw_size || offset + size < size) {
                        dev_err(dev, "resource table truncated\n");
                        return NULL;
                }
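The second clause added to the bounds check guards against unsigned wrap-around: sh_offset and sh_size come from an untrusted ELF section header, and a crafted pair can make offset + size wrap past zero and slip under fw_size. Written out separately (illustrative):

	u32 end = offset + size;	/* both values come straight from the image */

	if (end > fw_size ||		/* table runs past the end of the file   */
	    end < size)			/* or offset + size wrapped around in u32 */
		return NULL;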
@@ -280,16 +261,77 @@ rproc_elf_find_rsc_table(struct rproc *rproc, const struct firmware *fw,
                        return NULL;
                }
 
-               *tablesz = shdr->sh_size;
-               break;
+               return shdr;
        }
 
+       return NULL;
+}
+
+/**
+ * rproc_elf_find_rsc_table() - find the resource table
+ * @rproc: the rproc handle
+ * @fw: the ELF firmware image
+ * @tablesz: place holder for providing back the table size
+ *
+ * This function finds the resource table inside the remote processor's
+ * firmware. It is used both upon the registration of @rproc (in order
+ * to look for and register the supported virtio devices), and when the
+ * @rproc is booted.
+ *
+ * Returns the pointer to the resource table if it is found, and write its
+ * size into @tablesz. If a valid table isn't found, NULL is returned
+ * (and @tablesz isn't set).
+ */
+static struct resource_table *
+rproc_elf_find_rsc_table(struct rproc *rproc, const struct firmware *fw,
+                        int *tablesz)
+{
+       struct elf32_hdr *ehdr;
+       struct elf32_shdr *shdr;
+       struct device *dev = &rproc->dev;
+       struct resource_table *table = NULL;
+       const u8 *elf_data = fw->data;
+
+       ehdr = (struct elf32_hdr *)elf_data;
+
+       shdr = find_table(dev, ehdr, fw->size);
+       if (!shdr)
+               return NULL;
+
+       table = (struct resource_table *)(elf_data + shdr->sh_offset);
+       *tablesz = shdr->sh_size;
+
        return table;
 }
 
+/**
+ * rproc_elf_find_loaded_rsc_table() - find the loaded resource table
+ * @rproc: the rproc handle
+ * @fw: the ELF firmware image
+ *
+ * This function finds the location of the loaded resource table. Don't
+ * call this function if the table wasn't loaded yet - it's a bug if you do.
+ *
+ * Returns the pointer to the resource table if it is found or NULL otherwise.
+ * If the table wasn't loaded yet the result is unspecified.
+ */
+static struct resource_table *
+rproc_elf_find_loaded_rsc_table(struct rproc *rproc, const struct firmware *fw)
+{
+       struct elf32_hdr *ehdr = (struct elf32_hdr *)fw->data;
+       struct elf32_shdr *shdr;
+
+       shdr = find_table(&rproc->dev, ehdr, fw->size);
+       if (!shdr)
+               return NULL;
+
+       return rproc_da_to_va(rproc, shdr->sh_addr, shdr->sh_size);
+}
+
 const struct rproc_fw_ops rproc_elf_fw_ops = {
        .load = rproc_elf_load_segments,
        .find_rsc_table = rproc_elf_find_rsc_table,
+       .find_loaded_rsc_table = rproc_elf_find_loaded_rsc_table,
        .sanity_check = rproc_elf_sanity_check,
        .get_boot_addr = rproc_elf_get_boot_addr
 };
index 7bb66482d061d33e41cd9f4e69fb97fe0fa91655..157e762c15714d3c03778edd6ae3df8f68a31bab 100644 (file)
@@ -27,7 +27,8 @@ struct rproc;
 
 /**
  * struct rproc_fw_ops - firmware format specific operations.
- * @find_rsc_table:    finds the resource table inside the firmware image
+ * @find_rsc_table:    find the resource table inside the firmware image
+ * @find_loaded_rsc_table: find the loaded resource table
 * @load:              load firmware to memory, where the remote processor
  *                     expects to find it
  * @sanity_check:      sanity check the fw image
@@ -37,6 +38,8 @@ struct rproc_fw_ops {
        struct resource_table *(*find_rsc_table) (struct rproc *rproc,
                                                const struct firmware *fw,
                                                int *tablesz);
+       struct resource_table *(*find_loaded_rsc_table)(struct rproc *rproc,
+                                               const struct firmware *fw);
        int (*load)(struct rproc *rproc, const struct firmware *fw);
        int (*sanity_check)(struct rproc *rproc, const struct firmware *fw);
        u32 (*get_boot_addr)(struct rproc *rproc, const struct firmware *fw);
@@ -102,6 +105,16 @@ struct resource_table *rproc_find_rsc_table(struct rproc *rproc,
        return NULL;
 }
 
+static inline
+struct resource_table *rproc_find_loaded_rsc_table(struct rproc *rproc,
+                                const struct firmware *fw)
+{
+       if (rproc->fw_ops->find_loaded_rsc_table)
+               return rproc->fw_ops->find_loaded_rsc_table(rproc, fw);
+
+       return NULL;
+}
+
 extern const struct rproc_fw_ops rproc_elf_fw_ops;
 
 #endif /* REMOTEPROC_INTERNAL_H */
index afed9b7731c456b43551e999d97f054e13b31032..b09c75c21b609c51569e4088d8aba51769bec8a4 100644 (file)
@@ -173,25 +173,35 @@ error:
        return ret;
 }
 
-/*
- * We don't support yet real virtio status semantics.
- *
- * The plan is to provide this via the VDEV resource entry
- * which is part of the firmware: this way the remote processor
- * will be able to access the status values as set by us.
- */
 static u8 rproc_virtio_get_status(struct virtio_device *vdev)
 {
-       return 0;
+       struct rproc_vdev *rvdev = vdev_to_rvdev(vdev);
+       struct fw_rsc_vdev *rsc;
+
+       rsc = (void *)rvdev->rproc->table_ptr + rvdev->rsc_offset;
+
+       return rsc->status;
 }
 
 static void rproc_virtio_set_status(struct virtio_device *vdev, u8 status)
 {
+       struct rproc_vdev *rvdev = vdev_to_rvdev(vdev);
+       struct fw_rsc_vdev *rsc;
+
+       rsc = (void *)rvdev->rproc->table_ptr + rvdev->rsc_offset;
+
+       rsc->status = status;
        dev_dbg(&vdev->dev, "status: %d\n", status);
 }
 
 static void rproc_virtio_reset(struct virtio_device *vdev)
 {
+       struct rproc_vdev *rvdev = vdev_to_rvdev(vdev);
+       struct fw_rsc_vdev *rsc;
+
+       rsc = (void *)rvdev->rproc->table_ptr + rvdev->rsc_offset;
+
+       rsc->status = 0;
        dev_dbg(&vdev->dev, "reset !\n");
 }
 
@@ -199,13 +209,19 @@ static void rproc_virtio_reset(struct virtio_device *vdev)
 static u32 rproc_virtio_get_features(struct virtio_device *vdev)
 {
        struct rproc_vdev *rvdev = vdev_to_rvdev(vdev);
+       struct fw_rsc_vdev *rsc;
+
+       rsc = (void *)rvdev->rproc->table_ptr + rvdev->rsc_offset;
 
-       return rvdev->dfeatures;
+       return rsc->dfeatures;
 }
 
 static void rproc_virtio_finalize_features(struct virtio_device *vdev)
 {
        struct rproc_vdev *rvdev = vdev_to_rvdev(vdev);
+       struct fw_rsc_vdev *rsc;
+
+       rsc = (void *)rvdev->rproc->table_ptr + rvdev->rsc_offset;
 
        /* Give virtio_ring a chance to accept features */
        vring_transport_features(vdev);
@@ -213,13 +229,44 @@ static void rproc_virtio_finalize_features(struct virtio_device *vdev)
        /*
         * Remember the finalized features of our vdev, and provide it
         * to the remote processor once it is powered on.
-        *
-        * Similarly to the status field, we don't expose yet the negotiated
-        * features to the remote processors at this point. This will be
-        * fixed as part of a small resource table overhaul and then an
-        * extension of the virtio resource entries.
         */
-       rvdev->gfeatures = vdev->features[0];
+       rsc->gfeatures = vdev->features[0];
+}
+
+static void rproc_virtio_get(struct virtio_device *vdev, unsigned offset,
+                                                       void *buf, unsigned len)
+{
+       struct rproc_vdev *rvdev = vdev_to_rvdev(vdev);
+       struct fw_rsc_vdev *rsc;
+       void *cfg;
+
+       rsc = (void *)rvdev->rproc->table_ptr + rvdev->rsc_offset;
+       cfg = &rsc->vring[rsc->num_of_vrings];
+
+       if (offset + len > rsc->config_len || offset + len < len) {
+               dev_err(&vdev->dev, "rproc_virtio_get: access out of bounds\n");
+               return;
+       }
+
+       memcpy(buf, cfg + offset, len);
+}
+
+static void rproc_virtio_set(struct virtio_device *vdev, unsigned offset,
+                     const void *buf, unsigned len)
+{
+       struct rproc_vdev *rvdev = vdev_to_rvdev(vdev);
+       struct fw_rsc_vdev *rsc;
+       void *cfg;
+
+       rsc = (void *)rvdev->rproc->table_ptr + rvdev->rsc_offset;
+       cfg = &rsc->vring[rsc->num_of_vrings];
+
+       if (offset + len > rsc->config_len || offset + len < len) {
+               dev_err(&vdev->dev, "rproc_virtio_set: access out of bounds\n");
+               return;
+       }
+
+       memcpy(cfg + offset, buf, len);
 }
 
 static const struct virtio_config_ops rproc_virtio_config_ops = {
@@ -230,6 +277,8 @@ static const struct virtio_config_ops rproc_virtio_config_ops = {
        .reset          = rproc_virtio_reset,
        .set_status     = rproc_virtio_set_status,
        .get_status     = rproc_virtio_get_status,
+       .get            = rproc_virtio_get,
+       .set            = rproc_virtio_set,
 };
 
 /*
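With the vdev resource entry now shared with the remote processor, the virtio config space is the region immediately after the vring descriptors inside that entry (cfg = &rsc->vring[rsc->num_of_vrings]), and every access is checked against config_len, including the offset + len < len test that catches integer wrap-around. Below is a self-contained sketch of that layout and bounds check, using made-up structure names rather than the real fw_rsc_vdev.

    #include <stdio.h>
    #include <string.h>

    /* Illustrative stand-ins; this is not the kernel's fw_rsc_vdev layout. */
    struct demo_vring { unsigned int da, align, num, notifyid; };

    struct demo_vdev_rsc {
        unsigned int config_len;
        unsigned int num_of_vrings;
        struct demo_vring vring[2];
        unsigned char config[8];        /* config space follows the vrings */
    };

    static int demo_read_config(struct demo_vdev_rsc *rsc, unsigned int offset,
                                void *buf, unsigned int len)
    {
        void *cfg = &rsc->vring[rsc->num_of_vrings];    /* == rsc->config here */

        /* reject accesses past config_len and offset + len wrap-around */
        if (offset + len > rsc->config_len || offset + len < len)
            return -1;

        memcpy(buf, (unsigned char *)cfg + offset, len);
        return 0;
    }

    int main(void)
    {
        struct demo_vdev_rsc rsc = { .config_len = 8, .num_of_vrings = 2,
                                     .config = "abcdefg" };
        char buf[4] = { 0 };

        printf("%d\n", demo_read_config(&rsc, 2, buf, 3));     /* in bounds */
        printf("%d\n", demo_read_config(&rsc, 6, buf, 4));     /* rejected  */
        return 0;
    }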
index fb95c4220052decb09117075d3a798dbbb6235b7..1ec39a4c0b3edbd9a6c64ef95e22649ab6ca7de7 100644 (file)
@@ -64,26 +64,18 @@ static int sproc_load_segments(struct rproc *rproc, const struct firmware *fw)
 }
 
 /* Find the entry for resource table in the Table of Content */
-static struct ste_toc_entry *sproc_find_rsc_entry(const struct firmware *fw)
+static const struct ste_toc_entry *sproc_find_rsc_entry(const void *data)
 {
        int i;
-       struct ste_toc *toc;
-
-       if (!fw)
-               return NULL;
-
-       toc = (void *)fw->data;
+       const struct ste_toc *toc;
+       toc = data;
 
        /* Search the table for the resource table */
        for (i = 0; i < SPROC_MAX_TOC_ENTRIES &&
                    toc->table[i].start != 0xffffffff; i++) {
-
                if (!strncmp(toc->table[i].name, SPROC_RESOURCE_NAME,
-                            sizeof(toc->table[i].name))) {
-                       if (toc->table[i].start > fw->size)
-                               return NULL;
+                            sizeof(toc->table[i].name)))
                        return &toc->table[i];
-               }
        }
 
        return NULL;
@@ -96,9 +88,12 @@ sproc_find_rsc_table(struct rproc *rproc, const struct firmware *fw,
 {
        struct sproc *sproc = rproc->priv;
        struct resource_table *table;
-       struct ste_toc_entry *entry;
+       const struct ste_toc_entry *entry;
 
-       entry = sproc_find_rsc_entry(fw);
+       if (!fw)
+               return NULL;
+
+       entry = sproc_find_rsc_entry(fw->data);
        if (!entry) {
                sproc_err(sproc, "resource table not found in fw\n");
                return NULL;
@@ -149,10 +144,30 @@ sproc_find_rsc_table(struct rproc *rproc, const struct firmware *fw,
        return table;
 }
 
+/* Find the resource table inside the remote processor's firmware. */
+static struct resource_table *
+sproc_find_loaded_rsc_table(struct rproc *rproc, const struct firmware *fw)
+{
+       struct sproc *sproc = rproc->priv;
+       const struct ste_toc_entry *entry;
+
+       if (!fw || !sproc->fw_addr)
+               return NULL;
+
+       entry = sproc_find_rsc_entry(sproc->fw_addr);
+       if (!entry) {
+               sproc_err(sproc, "resource table not found in fw\n");
+               return NULL;
+       }
+
+       return sproc->fw_addr + entry->start;
+}
+
 /* STE modem firmware handler operations */
 const struct rproc_fw_ops sproc_fw_ops = {
        .load = sproc_load_segments,
        .find_rsc_table = sproc_find_rsc_table,
+       .find_loaded_rsc_table = sproc_find_loaded_rsc_table,
 };
 
 /* Kick the modem with specified notification id */
@@ -198,7 +213,7 @@ static int sproc_start(struct rproc *rproc)
        }
 
        /* Subscribe to notifications */
-       for (i = 0; i < rproc->max_notifyid; i++) {
+       for (i = 0; i <= rproc->max_notifyid; i++) {
                err = sproc->mdev->ops.kick_subscribe(sproc->mdev, i);
                if (err) {
                        sproc_err(sproc,
index f6e0ea6ffda5057e0417295a67d4fab3e0248975..69a219387582bf3391f6ecd7075e037769d0de9c 100644 (file)
@@ -4,5 +4,6 @@ menu "Rpmsg drivers"
 config RPMSG
        tristate
        select VIRTIO
+       select VIRTUALIZATION
 
 endmenu
index 56fceafec9ec01e615654a18588208227a249f24..b6135d4d54eb8f6c99dc677c988cc6a95fc2bda7 100644 (file)
@@ -776,23 +776,13 @@ out:
 }
 EXPORT_SYMBOL(rpmsg_send_offchannel_raw);
 
-/* called when an rx buffer is used, and it's time to digest a message */
-static void rpmsg_recv_done(struct virtqueue *rvq)
+static int rpmsg_recv_single(struct virtproc_info *vrp, struct device *dev,
+                            struct rpmsg_hdr *msg, unsigned int len)
 {
-       struct rpmsg_hdr *msg;
-       unsigned int len;
        struct rpmsg_endpoint *ept;
        struct scatterlist sg;
-       struct virtproc_info *vrp = rvq->vdev->priv;
-       struct device *dev = &rvq->vdev->dev;
        int err;
 
-       msg = virtqueue_get_buf(rvq, &len);
-       if (!msg) {
-               dev_err(dev, "uhm, incoming signal, but no used buffer ?\n");
-               return;
-       }
-
        dev_dbg(dev, "From: 0x%x, To: 0x%x, Len: %d, Flags: %d, Reserved: %d\n",
                                        msg->src, msg->dst, msg->len,
                                        msg->flags, msg->reserved);
@@ -806,7 +796,7 @@ static void rpmsg_recv_done(struct virtqueue *rvq)
        if (len > RPMSG_BUF_SIZE ||
                msg->len > (len - sizeof(struct rpmsg_hdr))) {
                dev_warn(dev, "inbound msg too big: (%d, %d)\n", len, msg->len);
-               return;
+               return -EINVAL;
        }
 
        /* use the dst addr to fetch the callback of the appropriate user */
@@ -842,11 +832,42 @@ static void rpmsg_recv_done(struct virtqueue *rvq)
        err = virtqueue_add_inbuf(vrp->rvq, &sg, 1, msg, GFP_KERNEL);
        if (err < 0) {
                dev_err(dev, "failed to add a virtqueue buffer: %d\n", err);
+               return err;
+       }
+
+       return 0;
+}
+
+/* called when an rx buffer is used, and it's time to digest a message */
+static void rpmsg_recv_done(struct virtqueue *rvq)
+{
+       struct virtproc_info *vrp = rvq->vdev->priv;
+       struct device *dev = &rvq->vdev->dev;
+       struct rpmsg_hdr *msg;
+       unsigned int len, msgs_received = 0;
+       int err;
+
+       msg = virtqueue_get_buf(rvq, &len);
+       if (!msg) {
+               dev_err(dev, "uhm, incoming signal, but no used buffer ?\n");
                return;
        }
 
+       while (msg) {
+               err = rpmsg_recv_single(vrp, dev, msg, len);
+               if (err)
+                       break;
+
+               msgs_received++;
+
+               msg = virtqueue_get_buf(rvq, &len);
+       }
+
+       dev_dbg(dev, "Received %u messages\n", msgs_received);
+
        /* tell the remote processor we added another available rx buffer */
-       virtqueue_kick(vrp->rvq);
+       if (msgs_received)
+               virtqueue_kick(vrp->rvq);
 }
 
 /*
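The receive path is reworked so a single interrupt drains every used buffer: rpmsg_recv_single() handles one message and re-queues its buffer, the loop keeps popping until the virtqueue is empty, and the remote side is kicked once at the end only if something was actually consumed. A toy user-space model of that drain-then-kick pattern follows; the queue and names are invented for illustration.

    #include <stdio.h>

    /* Toy stand-in for the used-buffer ring; names are purely illustrative. */
    static int pending[] = { 11, 22, 33 };
    static unsigned int head;

    static int *get_buf(void)
    {
        if (head >= sizeof(pending) / sizeof(pending[0]))
            return NULL;
        return &pending[head++];
    }

    static int handle_one(int *msg)
    {
        printf("handled %d\n", *msg);
        return 0;       /* a real handler may fail and stop the drain */
    }

    int main(void)
    {
        unsigned int received = 0;
        int *msg = get_buf();

        while (msg) {
            if (handle_one(msg))
                break;
            received++;
            msg = get_buf();
        }

        if (received)   /* notify the producer only once per drain */
            printf("kick after %u messages\n", received);
        return 0;
    }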
index 224d634322b4f0abedf40d31b039ce9bb491a732..ccf54f06396bf251bf42be6007c40db21b84a5c6 100644 (file)
@@ -68,6 +68,7 @@
 enum rtc_type {
        rtc_undef = 0,
        rtc_r2025sd,
+       rtc_r2221tl,
        rtc_rs5c372a,
        rtc_rs5c372b,
        rtc_rv5c386,
@@ -76,6 +77,7 @@ enum rtc_type {
 
 static const struct i2c_device_id rs5c372_id[] = {
        { "r2025sd", rtc_r2025sd },
+       { "r2221tl", rtc_r2221tl },
        { "rs5c372a", rtc_rs5c372a },
        { "rs5c372b", rtc_rs5c372b },
        { "rv5c386", rtc_rv5c386 },
@@ -529,6 +531,7 @@ static int rs5c_oscillator_setup(struct rs5c372 *rs5c372)
                rs5c372->time24 = 1;
                break;
        case rtc_r2025sd:
+       case rtc_r2221tl:
        case rtc_rv5c386:
        case rtc_rv5c387a:
                buf[0] |= RV5C387_CTRL1_24;
@@ -609,6 +612,7 @@ static int rs5c372_probe(struct i2c_client *client,
                        rs5c372->time24 = 1;
                break;
        case rtc_r2025sd:
+       case rtc_r2221tl:
        case rtc_rv5c386:
        case rtc_rv5c387a:
                if (rs5c372->regs[RS5C_REG_CTRL1] & RV5C387_CTRL1_24)
@@ -640,6 +644,7 @@ static int rs5c372_probe(struct i2c_client *client,
        dev_info(&client->dev, "%s found, %s, driver version " DRV_VERSION "\n",
                        ({ char *s; switch (rs5c372->type) {
                        case rtc_r2025sd:       s = "r2025sd"; break;
+                       case rtc_r2221tl:       s = "r2221tl"; break;
                        case rtc_rs5c372a:      s = "rs5c372a"; break;
                        case rtc_rs5c372b:      s = "rs5c372b"; break;
                        case rtc_rv5c386:       s = "rv5c386"; break;
index 82758cbb220be6a6bc2f27396c4063c6272ffeff..4361d9772c42ad8482e3ddbab2c7edf36294b579 100644 (file)
@@ -2997,18 +2997,14 @@ unlock:
        return rc;
 }
 
-static int dasd_release(struct gendisk *disk, fmode_t mode)
+static void dasd_release(struct gendisk *disk, fmode_t mode)
 {
-       struct dasd_device *base;
-
-       base = dasd_device_from_gendisk(disk);
-       if (!base)
-               return -ENODEV;
-
-       atomic_dec(&base->block->open_count);
-       module_put(base->discipline->owner);
-       dasd_put_device(base);
-       return 0;
+       struct dasd_device *base = dasd_device_from_gendisk(disk);
+       if (base) {
+               atomic_dec(&base->block->open_count);
+               module_put(base->discipline->owner);
+               dasd_put_device(base);
+       }
 }
 
 /*
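This and the following hunks (dcssblk, scm_blk, sd, sr) adapt to the block layer change that makes the release method return void; since no caller ever acted on the return value, the drivers now handle the missing-device case internally instead of returning -ENODEV. A minimal stand-alone sketch of the resulting shape, with invented names standing in for the real block_device_operations callback:

    #include <stdio.h>

    struct demo_dev { int open_count; };

    /*
     * Hypothetical release callback in the new shape: it returns nothing, so
     * a vanished device is handled (or just warned about) locally instead of
     * propagating an -ENODEV that nobody would read.
     */
    static void demo_release(struct demo_dev *dev)
    {
        if (!dev) {
            fprintf(stderr, "release without a device, nothing to do\n");
            return;
        }
        dev->open_count--;
    }

    int main(void)
    {
        struct demo_dev dev = { .open_count = 1 };

        demo_release(&dev);
        demo_release(NULL);
        printf("open_count=%d\n", dev.open_count);
        return 0;
    }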
index b6ad0de07930c531f9a81f8314a1b6b974e9d6db..6eca019bcf30a50edfab1a80daf1b351d2320474 100644 (file)
@@ -26,7 +26,7 @@
 #define DCSS_BUS_ID_SIZE 20
 
 static int dcssblk_open(struct block_device *bdev, fmode_t mode);
-static int dcssblk_release(struct gendisk *disk, fmode_t mode);
+static void dcssblk_release(struct gendisk *disk, fmode_t mode);
 static void dcssblk_make_request(struct request_queue *q, struct bio *bio);
 static int dcssblk_direct_access(struct block_device *bdev, sector_t secnum,
                                 void **kaddr, unsigned long *pfn);
@@ -781,16 +781,15 @@ out:
        return rc;
 }
 
-static int
+static void
 dcssblk_release(struct gendisk *disk, fmode_t mode)
 {
        struct dcssblk_dev_info *dev_info = disk->private_data;
        struct segment_info *entry;
-       int rc;
 
        if (!dev_info) {
-               rc = -ENODEV;
-               goto out;
+               WARN_ON(1);
+               return;
        }
        down_write(&dcssblk_devices_sem);
        if (atomic_dec_and_test(&dev_info->use_count)
@@ -803,9 +802,6 @@ dcssblk_release(struct gendisk *disk, fmode_t mode)
                dev_info->save_pending = 0;
        }
        up_write(&dcssblk_devices_sem);
-       rc = 0;
-out:
-       return rc;
 }
 
 static void
@@ -826,8 +822,7 @@ dcssblk_make_request(struct request_queue *q, struct bio *bio)
        if ((bio->bi_sector & 7) != 0 || (bio->bi_size & 4095) != 0)
                /* Request is not page-aligned. */
                goto fail;
-       if (((bio->bi_size >> 9) + bio->bi_sector)
-                       > get_capacity(bio->bi_bdev->bd_disk)) {
+       if (bio_end_sector(bio) > get_capacity(bio->bi_bdev->bd_disk)) {
                /* Request beyond end of DCSS segment. */
                goto fail;
        }
index b303cab76a7f3787149837fdd3223b6d04c7d7bb..5d73e6e49af6c96cb9e97b60163485b1ba5dbb8b 100644 (file)
@@ -123,10 +123,9 @@ static int scm_open(struct block_device *blkdev, fmode_t mode)
        return scm_get_ref();
 }
 
-static int scm_release(struct gendisk *gendisk, fmode_t mode)
+static void scm_release(struct gendisk *gendisk, fmode_t mode)
 {
        scm_put_ref();
-       return 0;
 }
 
 static const struct block_device_operations scm_blk_devops = {
index 55cbd018015997bba484bfe5ffdbacabcc7e43db..f42b0e15410f8a52e0a570664b04547d459e7aec 100644 (file)
@@ -2163,10 +2163,10 @@ int sas_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy,
        }
 
        /* do we need to support multiple segments? */
-       if (req->bio->bi_vcnt > 1 || rsp->bio->bi_vcnt > 1) {
+       if (bio_segments(req->bio) > 1 || bio_segments(rsp->bio) > 1) {
                printk("%s: multiple segments req %u %u, rsp %u %u\n",
-                      __func__, req->bio->bi_vcnt, blk_rq_bytes(req),
-                      rsp->bio->bi_vcnt, blk_rq_bytes(rsp));
+                      __func__, bio_segments(req->bio), blk_rq_bytes(req),
+                      bio_segments(rsp->bio), blk_rq_bytes(rsp));
                return -EINVAL;
        }
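The bi_vcnt checks here and in the mpt2sas hunks below become bio_segments(): bi_vcnt is the total number of bio_vec entries ever added to the bio, while bio_segments() counts only the segments still to be processed (in kernels of this vintage it is essentially bi_vcnt - bi_idx), so a bio that has already been partially advanced is not misclassified as multi-segment. A tiny illustrative model of that difference, not the real bio structure or macro:

    #include <stdio.h>

    /* Toy model only: not the real struct bio or the bio_segments() macro. */
    struct toy_bio { unsigned short bi_vcnt, bi_idx; };

    static unsigned int toy_bio_segments(const struct toy_bio *bio)
    {
        return bio->bi_vcnt - bio->bi_idx;  /* remaining, not total, segments */
    }

    int main(void)
    {
        struct toy_bio bio = { .bi_vcnt = 3, .bi_idx = 2 };

        printf("segments left: %u\n", toy_bio_segments(&bio));  /* prints 1 */
        return 0;
    }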
 
index 8c2ffbe6af0f1dbd67e724507101eabe5e0ceca6..193e7ae90c3bd754b7a50a01db5fda73b2822502 100644 (file)
@@ -1939,7 +1939,7 @@ _transport_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy,
        ioc->transport_cmds.status = MPT2_CMD_PENDING;
 
        /* Check if the request is split across multiple segments */
-       if (req->bio->bi_vcnt > 1) {
+       if (bio_segments(req->bio) > 1) {
                u32 offset = 0;
 
                /* Allocate memory and copy the request */
@@ -1971,7 +1971,7 @@ _transport_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy,
 
        /* Check if the response needs to be populated across
         * multiple segments */
-       if (rsp->bio->bi_vcnt > 1) {
+       if (bio_segments(rsp->bio) > 1) {
                pci_addr_in = pci_alloc_consistent(ioc->pdev, blk_rq_bytes(rsp),
                    &pci_dma_in);
                if (!pci_addr_in) {
@@ -2038,7 +2038,7 @@ _transport_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy,
        sgl_flags = (MPI2_SGE_FLAGS_SIMPLE_ELEMENT |
            MPI2_SGE_FLAGS_END_OF_BUFFER | MPI2_SGE_FLAGS_HOST_TO_IOC);
        sgl_flags = sgl_flags << MPI2_SGE_FLAGS_SHIFT;
-       if (req->bio->bi_vcnt > 1) {
+       if (bio_segments(req->bio) > 1) {
                ioc->base_add_sg_single(psge, sgl_flags |
                    (blk_rq_bytes(req) - 4), pci_dma_out);
        } else {
@@ -2054,7 +2054,7 @@ _transport_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy,
            MPI2_SGE_FLAGS_LAST_ELEMENT | MPI2_SGE_FLAGS_END_OF_BUFFER |
            MPI2_SGE_FLAGS_END_OF_LIST);
        sgl_flags = sgl_flags << MPI2_SGE_FLAGS_SHIFT;
-       if (rsp->bio->bi_vcnt > 1) {
+       if (bio_segments(rsp->bio) > 1) {
                ioc->base_add_sg_single(psge, sgl_flags |
                    (blk_rq_bytes(rsp) + 4), pci_dma_in);
        } else {
@@ -2099,7 +2099,7 @@ _transport_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy,
                    le16_to_cpu(mpi_reply->ResponseDataLength);
                /* check if the resp needs to be copied from the allocated
                 * pci mem */
-               if (rsp->bio->bi_vcnt > 1) {
+               if (bio_segments(rsp->bio) > 1) {
                        u32 offset = 0;
                        u32 bytes_to_copy =
                            le16_to_cpu(mpi_reply->ResponseDataLength);
index 7992635d405fbe91cd1029e8cce15af7e60becc0..e6689776b4f617ac55b1e9c4dac138cee85a9196 100644 (file)
@@ -1188,7 +1188,7 @@ error_autopm:
  *
  *     Locking: called with bdev->bd_mutex held.
  **/
-static int sd_release(struct gendisk *disk, fmode_t mode)
+static void sd_release(struct gendisk *disk, fmode_t mode)
 {
        struct scsi_disk *sdkp = scsi_disk(disk);
        struct scsi_device *sdev = sdkp->device;
@@ -1207,7 +1207,6 @@ static int sd_release(struct gendisk *disk, fmode_t mode)
 
        scsi_autopm_put_device(sdev);
        scsi_disk_put(sdkp);
-       return 0;
 }
 
 static int sd_getgeo(struct block_device *bdev, struct hd_geometry *geo)
index 9f0c46547459c4ddb1a71c41f769fe2761034983..df5e961484e108312f6f01765882dd4319d584f7 100644 (file)
@@ -35,6 +35,7 @@ static int sg_version_num = 30534;    /* 2 digits for each component */
 #include <linux/sched.h>
 #include <linux/string.h>
 #include <linux/mm.h>
+#include <linux/aio.h>
 #include <linux/errno.h>
 #include <linux/mtio.h>
 #include <linux/ioctl.h>
index f2884ee90710159f1e28e322749c9886aacca21f..119d67f9c47edef7517e120f510169df4b105316 100644 (file)
@@ -541,14 +541,13 @@ static int sr_block_open(struct block_device *bdev, fmode_t mode)
        return ret;
 }
 
-static int sr_block_release(struct gendisk *disk, fmode_t mode)
+static void sr_block_release(struct gendisk *disk, fmode_t mode)
 {
        struct scsi_cd *cd = scsi_cd(disk);
        mutex_lock(&sr_mutex);
        cdrom_release(&cd->cdi, mode);
        scsi_cd_put(cd);
        mutex_unlock(&sr_mutex);
-       return 0;
 }
 
 static int sr_block_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
index 141d8c10b7645bb29f0d2c1dd03c2d067d3577e1..92a9345d7a6bdeaa26fc8ca64a72a9d3a4d0554e 100644 (file)
@@ -62,7 +62,7 @@ config SPI_ALTERA
 
 config SPI_ATH79
        tristate "Atheros AR71XX/AR724X/AR913X SPI controller driver"
-       depends on ATH79 && GENERIC_GPIO
+       depends on ATH79 && GPIOLIB
        select SPI_BITBANG
        help
          This enables support for the SPI controller present on the
@@ -175,7 +175,7 @@ config SPI_FALCON
 
 config SPI_GPIO
        tristate "GPIO-based bitbanging SPI Master"
-       depends on GENERIC_GPIO
+       depends on GPIOLIB
        select SPI_BITBANG
        help
          This simple GPIO bitbanging SPI master uses the arch-neutral GPIO
@@ -259,7 +259,7 @@ config SPI_FSL_ESPI
 
 config SPI_OC_TINY
        tristate "OpenCores tiny SPI"
-       depends on GENERIC_GPIO
+       depends on GPIOLIB
        select SPI_BITBANG
        help
          This is the driver for OpenCores tiny SPI master controller.
@@ -457,7 +457,7 @@ config SPI_TOPCLIFF_PCH
 
 config SPI_TXX9
        tristate "Toshiba TXx9 SPI controller"
-       depends on GENERIC_GPIO && CPU_TX49XX
+       depends on GPIOLIB && CPU_TX49XX
        help
          SPI driver for Toshiba TXx9 MIPS SoCs
 
index fa385a368a561219b3c510524bffd0325ea1446d..09077067b0c858d5e13ab74819c9985b1429c0ca 100644 (file)
@@ -18,7 +18,7 @@
 
 #include "ssb_private.h"
 
-static const char *part_probes[] = { "bcm47xxpart", NULL };
+static const char * const part_probes[] = { "bcm47xxpart", NULL };
 
 static struct physmap_flash_data ssb_pflash_data = {
        .part_probe_types       = part_probes,
index 9f61d46da157ed031c67a102ba6584fbfb022d05..c0c95be0f969de825119ce2a6ee535fd6668a63b 100644 (file)
@@ -54,7 +54,7 @@ config ANDROID_TIMED_OUTPUT
 
 config ANDROID_TIMED_GPIO
        tristate "Android timed gpio driver"
-       depends on GENERIC_GPIO && ANDROID_TIMED_OUTPUT
+       depends on GPIOLIB && ANDROID_TIMED_OUTPUT
        default n
 
 config ANDROID_LOW_MEMORY_KILLER
index b14a55742559986781331e2bdd10cd7711b01d5f..b040200a5a5502d7a110e9064d25b10600196065 100644 (file)
@@ -28,6 +28,7 @@
 #include <linux/slab.h>
 #include <linux/time.h>
 #include <linux/vmalloc.h>
+#include <linux/aio.h>
 #include "logger.h"
 
 #include <asm/ioctls.h>
index e2e786dc9c7ba7ef1908d738e25354d8fa18a9b9..ad45dfbdf4172e51f9c6ff62d1304f3827dd3538 100644 (file)
@@ -61,7 +61,7 @@ config LIS3L02DQ
        depends on SPI
        select IIO_TRIGGER if IIO_BUFFER
        depends on !IIO_BUFFER || IIO_KFIFO_BUF
-       depends on GENERIC_GPIO
+       depends on GPIOLIB
        help
          Say yes here to build SPI support for the ST microelectronics
          accelerometer. The driver supplies direct access via sysfs files
index d990829008ffc2c45a163b08212e272f16bd21a5..cabc7a367db50833f6e40dc55ebec4efb9cb57bb 100644 (file)
@@ -73,7 +73,7 @@ config AD7780
 config AD7816
        tristate "Analog Devices AD7816/7/8 temperature sensor and ADC driver"
        depends on SPI
-       depends on GENERIC_GPIO
+       depends on GPIOLIB
        help
          Say yes here to build support for Analog Devices AD7816/7/8
          temperature sensors and ADC.
index 698a8970b372f7991f5f36b3ab619180a5b1f89d..e6795e0bed1d47a6c6775ff5b2e0b5b0a0d9fd43 100644 (file)
@@ -5,7 +5,7 @@ menu "Analog digital bi-direction converters"
 
 config ADT7316
        tristate "Analog Devices ADT7316/7/8 ADT7516/7/9 temperature sensor, ADC and DAC driver"
-       depends on GENERIC_GPIO
+       depends on GPIOLIB
        help
          Say yes here to build support for Analog Devices ADT7316, ADT7317, ADT7318
          and ADT7516, ADT7517, ADT7519 temperature sensors, ADC and DAC.
index 49f69ef986fcc3ec1cb863cba431feae0365b0d2..ce360f16321602ca3727998697b26f62b446c952 100644 (file)
@@ -13,7 +13,7 @@ config AD2S90
 config AD2S1200
        tristate "Analog Devices ad2s1200/ad2s1205 driver"
        depends on SPI
-       depends on GENERIC_GPIO
+       depends on GPIOLIB
        help
          Say yes here to build support for Analog Devices spi resolver
          to digital converters, ad2s1200 and ad2s1205, provides direct access
@@ -22,7 +22,7 @@ config AD2S1200
 config AD2S1210
        tristate "Analog Devices ad2s1210 driver"
        depends on SPI
-       depends on GENERIC_GPIO
+       depends on GPIOLIB
        help
          Say yes here to build support for Analog Devices spi resolver
          to digital converters, ad2s1210, provides direct access via sysfs.
index d44d3ad26fa517527f3d1369e28e221b40bc72f3..1a051da62505c498289340aea1ee498589b0eab1 100644 (file)
@@ -14,7 +14,7 @@ config IIO_PERIODIC_RTC_TRIGGER
 
 config IIO_GPIO_TRIGGER
        tristate "GPIO trigger"
-       depends on GENERIC_GPIO
+       depends on GPIOLIB
        help
          Provides support for using GPIO pins as IIO triggers.
 
index a764f165b58930da5044c208693a86a817bab93f..5e3c02554d99574f7fb8d62b130235789b1ab66d 100644 (file)
@@ -67,15 +67,16 @@ config THERMAL_GOV_USER_SPACE
          Enable this to let the user space manage the platform thermals.
 
 config CPU_THERMAL
-       tristate "generic cpu cooling support"
+       bool "generic cpu cooling support"
        depends on CPU_FREQ
        select CPU_FREQ_TABLE
        help
          This implements the generic cpu cooling mechanism through frequency
-         reduction, cpu hotplug and any other ways of reducing temperature. An
-         ACPI version of this already exists(drivers/acpi/processor_thermal.c).
+         reduction. An ACPI version of this already exists
+         (drivers/acpi/processor_thermal.c).
          This will be useful for platforms using the generic thermal interface
          and not the ACPI interface.
+
          If you want this support, you should say Y here.
 
 config THERMAL_EMULATION
@@ -86,6 +87,10 @@ config THERMAL_EMULATION
          user can manually input temperature and test the different trip
          threshold behaviour for simulation purpose.
 
+         WARNING: Be careful while enabling this option on production systems,
+         because userland can easily disable the thermal policy by simply
+         flooding this sysfs node with low temperature values.
+
 config SPEAR_THERMAL
        bool "SPEAr thermal sensor driver"
        depends on PLAT_SPEAR
@@ -117,15 +122,6 @@ config EXYNOS_THERMAL
          If you say yes here you get support for TMU (Thermal Management
          Unit) on SAMSUNG EXYNOS series of SoC.
 
-config EXYNOS_THERMAL_EMUL
-       bool "EXYNOS TMU emulation mode support"
-       depends on EXYNOS_THERMAL
-       help
-         Exynos 4412 and 4414 and 5 series has emulation mode on TMU.
-         Enable this option will be make sysfs node in exynos thermal platform
-         device directory to support emulation mode. With emulation mode sysfs
-         node, you can manually input temperature to TMU for simulation purpose.
-
 config DOVE_THERMAL
        tristate "Temperature sensor on Marvell Dove SoCs"
        depends on ARCH_DOVE
@@ -144,6 +140,14 @@ config DB8500_THERMAL
          created. Cooling devices can be bound to the trip points to cool this
          thermal zone if trip points reached.
 
+config ARMADA_THERMAL
+       tristate "Armada 370/XP thermal management"
+       depends on ARCH_MVEBU
+       depends on OF
+       help
+         Enable this option if you want to have support for the thermal
+         management controller present in the Armada 370 and Armada XP SoCs.
+
 config DB8500_CPUFREQ_COOLING
        tristate "DB8500 cpufreq cooling"
        depends on ARCH_U8500
index d3a2b38c31e86b694df71a4ccbb2b366025fb317..c054d410ac3f001e7192f63c0bf65767d55b22b3 100644 (file)
@@ -3,14 +3,15 @@
 #
 
 obj-$(CONFIG_THERMAL)          += thermal_sys.o
+thermal_sys-y                  += thermal_core.o
 
 # governors
-obj-$(CONFIG_THERMAL_GOV_FAIR_SHARE)   += fair_share.o
-obj-$(CONFIG_THERMAL_GOV_STEP_WISE)    += step_wise.o
-obj-$(CONFIG_THERMAL_GOV_USER_SPACE)   += user_space.o
+thermal_sys-$(CONFIG_THERMAL_GOV_FAIR_SHARE)   += fair_share.o
+thermal_sys-$(CONFIG_THERMAL_GOV_STEP_WISE)    += step_wise.o
+thermal_sys-$(CONFIG_THERMAL_GOV_USER_SPACE)   += user_space.o
 
 # cpufreq cooling
-obj-$(CONFIG_CPU_THERMAL)      += cpu_cooling.o
+thermal_sys-$(CONFIG_CPU_THERMAL)      += cpu_cooling.o
 
 # platform thermal drivers
 obj-$(CONFIG_SPEAR_THERMAL)    += spear_thermal.o
@@ -19,6 +20,7 @@ obj-$(CONFIG_KIRKWOOD_THERMAL)  += kirkwood_thermal.o
 obj-$(CONFIG_EXYNOS_THERMAL)   += exynos_thermal.o
 obj-$(CONFIG_DOVE_THERMAL)     += dove_thermal.o
 obj-$(CONFIG_DB8500_THERMAL)   += db8500_thermal.o
+obj-$(CONFIG_ARMADA_THERMAL)   += armada_thermal.o
 obj-$(CONFIG_DB8500_CPUFREQ_COOLING)   += db8500_cpufreq_cooling.o
 obj-$(CONFIG_INTEL_POWERCLAMP) += intel_powerclamp.o
 
diff --git a/drivers/thermal/armada_thermal.c b/drivers/thermal/armada_thermal.c
new file mode 100644 (file)
index 0000000..5b4d75f
--- /dev/null
@@ -0,0 +1,232 @@
+/*
+ * Marvell Armada 370/XP thermal sensor driver
+ *
+ * Copyright (C) 2013 Marvell
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/io.h>
+#include <linux/kernel.h>
+#include <linux/of.h>
+#include <linux/module.h>
+#include <linux/delay.h>
+#include <linux/platform_device.h>
+#include <linux/of_device.h>
+#include <linux/thermal.h>
+
+#define THERMAL_VALID_OFFSET           9
+#define THERMAL_VALID_MASK             0x1
+#define THERMAL_TEMP_OFFSET            10
+#define THERMAL_TEMP_MASK              0x1ff
+
+/* Thermal Manager Control and Status Register */
+#define PMU_TDC0_SW_RST_MASK           (0x1 << 1)
+#define PMU_TM_DISABLE_OFFS            0
+#define PMU_TM_DISABLE_MASK            (0x1 << PMU_TM_DISABLE_OFFS)
+#define PMU_TDC0_REF_CAL_CNT_OFFS      11
+#define PMU_TDC0_REF_CAL_CNT_MASK      (0x1ff << PMU_TDC0_REF_CAL_CNT_OFFS)
+#define PMU_TDC0_OTF_CAL_MASK          (0x1 << 30)
+#define PMU_TDC0_START_CAL_MASK                (0x1 << 25)
+
+struct armada_thermal_ops;
+
+/* Marvell EBU Thermal Sensor Dev Structure */
+struct armada_thermal_priv {
+       void __iomem *sensor;
+       void __iomem *control;
+       struct armada_thermal_ops *ops;
+};
+
+struct armada_thermal_ops {
+       /* Initialize the sensor */
+       void (*init_sensor)(struct armada_thermal_priv *);
+
+       /* Test for a valid sensor value (optional) */
+       bool (*is_valid)(struct armada_thermal_priv *);
+};
+
+static void armadaxp_init_sensor(struct armada_thermal_priv *priv)
+{
+       unsigned long reg;
+
+       reg = readl_relaxed(priv->control);
+       reg |= PMU_TDC0_OTF_CAL_MASK;
+       writel(reg, priv->control);
+
+       /* Reference calibration value */
+       reg &= ~PMU_TDC0_REF_CAL_CNT_MASK;
+       reg |= (0xf1 << PMU_TDC0_REF_CAL_CNT_OFFS);
+       writel(reg, priv->control);
+
+       /* Reset the sensor */
+       reg = readl_relaxed(priv->control);
+       writel((reg | PMU_TDC0_SW_RST_MASK), priv->control);
+
+       writel(reg, priv->control);
+
+       /* Enable the sensor */
+       reg = readl_relaxed(priv->sensor);
+       reg &= ~PMU_TM_DISABLE_MASK;
+       writel(reg, priv->sensor);
+}
+
+static void armada370_init_sensor(struct armada_thermal_priv *priv)
+{
+       unsigned long reg;
+
+       reg = readl_relaxed(priv->control);
+       reg |= PMU_TDC0_OTF_CAL_MASK;
+       writel(reg, priv->control);
+
+       /* Reference calibration value */
+       reg &= ~PMU_TDC0_REF_CAL_CNT_MASK;
+       reg |= (0xf1 << PMU_TDC0_REF_CAL_CNT_OFFS);
+       writel(reg, priv->control);
+
+       reg &= ~PMU_TDC0_START_CAL_MASK;
+       writel(reg, priv->control);
+
+       mdelay(10);
+}
+
+static bool armada_is_valid(struct armada_thermal_priv *priv)
+{
+       unsigned long reg = readl_relaxed(priv->sensor);
+
+       return (reg >> THERMAL_VALID_OFFSET) & THERMAL_VALID_MASK;
+}
+
+static int armada_get_temp(struct thermal_zone_device *thermal,
+                         unsigned long *temp)
+{
+       struct armada_thermal_priv *priv = thermal->devdata;
+       unsigned long reg;
+
+       /* Valid check */
+       if (priv->ops->is_valid && !priv->ops->is_valid(priv)) {
+               dev_err(&thermal->device,
+                       "Temperature sensor reading not valid\n");
+               return -EIO;
+       }
+
+       reg = readl_relaxed(priv->sensor);
+       reg = (reg >> THERMAL_TEMP_OFFSET) & THERMAL_TEMP_MASK;
+       *temp = (3153000000UL - (10000000UL*reg)) / 13825;
+       return 0;
+}
+
+static struct thermal_zone_device_ops ops = {
+       .get_temp = armada_get_temp,
+};
+
+static const struct armada_thermal_ops armadaxp_ops = {
+       .init_sensor = armadaxp_init_sensor,
+};
+
+static const struct armada_thermal_ops armada370_ops = {
+       .is_valid = armada_is_valid,
+       .init_sensor = armada370_init_sensor,
+};
+
+static const struct of_device_id armada_thermal_id_table[] = {
+       {
+               .compatible = "marvell,armadaxp-thermal",
+               .data       = &armadaxp_ops,
+       },
+       {
+               .compatible = "marvell,armada370-thermal",
+               .data       = &armada370_ops,
+       },
+       {
+               /* sentinel */
+       },
+};
+MODULE_DEVICE_TABLE(of, armada_thermal_id_table);
+
+static int armada_thermal_probe(struct platform_device *pdev)
+{
+       struct thermal_zone_device *thermal;
+       const struct of_device_id *match;
+       struct armada_thermal_priv *priv;
+       struct resource *res;
+
+       match = of_match_device(armada_thermal_id_table, &pdev->dev);
+       if (!match)
+               return -ENODEV;
+
+       priv = devm_kzalloc(&pdev->dev, sizeof(*priv), GFP_KERNEL);
+       if (!priv)
+               return -ENOMEM;
+
+       res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+       if (!res) {
+               dev_err(&pdev->dev, "Failed to get platform resource\n");
+               return -ENODEV;
+       }
+
+       priv->sensor = devm_ioremap_resource(&pdev->dev, res);
+       if (IS_ERR(priv->sensor))
+               return PTR_ERR(priv->sensor);
+
+       res = platform_get_resource(pdev, IORESOURCE_MEM, 1);
+       if (!res) {
+               dev_err(&pdev->dev, "Failed to get platform resource\n");
+               return -ENODEV;
+       }
+
+       priv->control = devm_ioremap_resource(&pdev->dev, res);
+       if (IS_ERR(priv->control))
+               return PTR_ERR(priv->control);
+
+       priv->ops = (struct armada_thermal_ops *)match->data;
+       priv->ops->init_sensor(priv);
+
+       thermal = thermal_zone_device_register("armada_thermal", 0, 0,
+                                              priv, &ops, NULL, 0, 0);
+       if (IS_ERR(thermal)) {
+               dev_err(&pdev->dev,
+                       "Failed to register thermal zone device\n");
+               return PTR_ERR(thermal);
+       }
+
+       platform_set_drvdata(pdev, thermal);
+
+       return 0;
+}
+
+static int armada_thermal_exit(struct platform_device *pdev)
+{
+       struct thermal_zone_device *armada_thermal =
+               platform_get_drvdata(pdev);
+
+       thermal_zone_device_unregister(armada_thermal);
+       platform_set_drvdata(pdev, NULL);
+
+       return 0;
+}
+
+static struct platform_driver armada_thermal_driver = {
+       .probe = armada_thermal_probe,
+       .remove = armada_thermal_exit,
+       .driver = {
+               .name = "armada_thermal",
+               .owner = THIS_MODULE,
+               .of_match_table = of_match_ptr(armada_thermal_id_table),
+       },
+};
+
+module_platform_driver(armada_thermal_driver);
+
+MODULE_AUTHOR("Ezequiel Garcia <ezequiel.garcia@free-electrons.com>");
+MODULE_DESCRIPTION("Armada 370/XP thermal driver");
+MODULE_LICENSE("GPL v2");
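For reference, armada_get_temp() above turns the 9-bit raw reading into millidegrees Celsius via (3153000000 - 10000000 * reg) / 13825, i.e. roughly (315.3 - reg) / 1.3825 degrees C. A quick stand-alone check of that arithmetic with an arbitrary sample register value:

    #include <stdio.h>

    int main(void)
    {
        unsigned long reg = 200;        /* arbitrary example raw reading */
        unsigned long mc = (3153000000UL - 10000000UL * reg) / 13825;

        /* prints "reg=200 -> 83.399 C" */
        printf("reg=%lu -> %lu.%03lu C\n", reg, mc / 1000, mc % 1000);
        return 0;
    }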
index 8dc44cbb3e09fe253546347e376c0834957efc1e..c94bf2e5de629419c8e00e29f2cf23ac4507c365 100644 (file)
  *
  * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  */
-#include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/thermal.h>
-#include <linux/platform_device.h>
 #include <linux/cpufreq.h>
 #include <linux/err.h>
 #include <linux/slab.h>
 #include <linux/cpu_cooling.h>
 
 /**
- * struct cpufreq_cooling_device
+ * struct cpufreq_cooling_device - data for cooling device with cpufreq
  * @id: unique integer value corresponding to each cpufreq_cooling_device
  *     registered.
- * @cool_dev: thermal_cooling_device pointer to keep track of the the
- *     egistered cooling device.
+ * @cool_dev: thermal_cooling_device pointer to keep track of the
+ *     registered cooling device.
  * @cpufreq_state: integer value representing the current state of cpufreq
  *     cooling devices.
  * @cpufreq_val: integer value representing the absolute value of the clipped
  *     frequency.
  * @allowed_cpus: all the cpus involved for this cpufreq_cooling_device.
- * @node: list_head to link all cpufreq_cooling_device together.
  *
  * This structure is required for keeping information of each
- * cpufreq_cooling_device registered as a list whose head is represented by
- * cooling_cpufreq_list. In order to prevent corruption of this list a
+ * cpufreq_cooling_device registered. In order to prevent corruption of this, a
  * mutex lock cooling_cpufreq_lock is used.
  */
 struct cpufreq_cooling_device {
@@ -54,9 +50,7 @@ struct cpufreq_cooling_device {
        unsigned int cpufreq_state;
        unsigned int cpufreq_val;
        struct cpumask allowed_cpus;
-       struct list_head node;
 };
-static LIST_HEAD(cooling_cpufreq_list);
 static DEFINE_IDR(cpufreq_idr);
 static DEFINE_MUTEX(cooling_cpufreq_lock);
 
@@ -70,6 +64,11 @@ static struct cpufreq_cooling_device *notify_device;
  * get_idr - function to get a unique id.
  * @idr: struct idr * handle used to create a id.
  * @id: int * value generated by this function.
+ *
+ * This function will populate @id with an unique
+ * id, using the idr API.
+ *
+ * Return: 0 on success, an error code on failure.
  */
 static int get_idr(struct idr *idr, int *id)
 {
@@ -81,6 +80,7 @@ static int get_idr(struct idr *idr, int *id)
        if (unlikely(ret < 0))
                return ret;
        *id = ret;
+
        return 0;
 }
 
@@ -99,63 +99,162 @@ static void release_idr(struct idr *idr, int id)
 /* Below code defines functions to be used for cpufreq as cooling device */
 
 /**
- * is_cpufreq_valid - function to check if a cpu has frequency transition policy.
+ * is_cpufreq_valid - function to check frequency transitioning capability.
  * @cpu: cpu for which check is needed.
+ *
+ * This function will check whether the current state of the system allows
+ * changing the frequency of the given @cpu.
+ *
+ * Return: 0 if the system is not currently capable of changing
+ * the frequency of given cpu. !0 in case the frequency is changeable.
  */
 static int is_cpufreq_valid(int cpu)
 {
        struct cpufreq_policy policy;
+
        return !cpufreq_get_policy(&policy, cpu);
 }
 
+enum cpufreq_cooling_property {
+       GET_LEVEL,
+       GET_FREQ,
+       GET_MAXL,
+};
+
 /**
- * get_cpu_frequency - get the absolute value of frequency from level.
- * @cpu: cpu for which frequency is fetched.
- * @level: level of frequency, equals cooling state of cpu cooling device
- *     e.g level=0 --> 1st MAX FREQ, level=1 ---> 2nd MAX FREQ, .... etc
+ * get_property - fetch a property of interest for a given cpu.
+ * @cpu: cpu for which the property is required
+ * @input: query parameter
+ * @output: query return
+ * @property: type of query (frequency, level, max level)
+ *
+ * This is the common function to
+ * 1. get maximum cpu cooling states
+ * 2. translate frequency to cooling state
+ * 3. translate cooling state to frequency
+ * Note that the code may not be in good shape
+ * but it is written in this way in order to:
+ * a) reduce duplicate code as most of the code can be shared.
+ * b) make sure the logic is consistent when translating between
+ *    cooling states and frequencies.
+ *
+ * Return: 0 on success, -EINVAL when invalid parameters are passed.
  */
-static unsigned int get_cpu_frequency(unsigned int cpu, unsigned long level)
+static int get_property(unsigned int cpu, unsigned long input,
+                       unsigned int *output,
+                       enum cpufreq_cooling_property property)
 {
-       int ret = 0, i = 0;
-       unsigned long level_index;
-       bool descend = false;
+       int i, j;
+       unsigned long max_level = 0, level = 0;
+       unsigned int freq = CPUFREQ_ENTRY_INVALID;
+       int descend = -1;
        struct cpufreq_frequency_table *table =
                                        cpufreq_frequency_get_table(cpu);
+
+       if (!output)
+               return -EINVAL;
+
        if (!table)
-               return ret;
+               return -EINVAL;
 
-       while (table[i].frequency != CPUFREQ_TABLE_END) {
+       for (i = 0; table[i].frequency != CPUFREQ_TABLE_END; i++) {
+               /* ignore invalid entries */
                if (table[i].frequency == CPUFREQ_ENTRY_INVALID)
                        continue;
 
-               /*check if table in ascending or descending order*/
-               if ((table[i + 1].frequency != CPUFREQ_TABLE_END) &&
-                       (table[i + 1].frequency < table[i].frequency)
-                       && !descend) {
-                       descend = true;
-               }
+               /* ignore duplicate entry */
+               if (freq == table[i].frequency)
+                       continue;
+
+               /* get the frequency order */
+               if (freq != CPUFREQ_ENTRY_INVALID && descend == -1)
+                       descend = !!(freq > table[i].frequency);
 
-               /*return if level matched and table in descending order*/
-               if (descend && i == level)
-                       return table[i].frequency;
-               i++;
+               freq = table[i].frequency;
+               max_level++;
        }
-       i--;
 
-       if (level > i || descend)
-               return ret;
-       level_index = i - level;
+       /* get max level */
+       if (property == GET_MAXL) {
+               *output = (unsigned int)max_level;
+               return 0;
+       }
 
-       /*Scan the table in reverse order and match the level*/
-       while (i >= 0) {
+       if (property == GET_FREQ)
+               level = descend ? input : (max_level - input - 1);
+
+       for (i = 0, j = 0; table[i].frequency != CPUFREQ_TABLE_END; i++) {
+               /* ignore invalid entry */
                if (table[i].frequency == CPUFREQ_ENTRY_INVALID)
                        continue;
-               /*return if level matched*/
-               if (i == level_index)
-                       return table[i].frequency;
-               i--;
+
+               /* ignore duplicate entry */
+               if (freq == table[i].frequency)
+                       continue;
+
+               /* now we have a valid frequency entry */
+               freq = table[i].frequency;
+
+               if (property == GET_LEVEL && (unsigned int)input == freq) {
+                       /* get level by frequency */
+                       *output = descend ? j : (max_level - j - 1);
+                       return 0;
+               }
+               if (property == GET_FREQ && level == j) {
+                       /* get frequency by level */
+                       *output = freq;
+                       return 0;
+               }
+               j++;
        }
-       return ret;
+
+       return -EINVAL;
+}
+
+/**
+ * cpufreq_cooling_get_level - for a given cpu, return the cooling level.
+ * @cpu: cpu for which the level is required
+ * @freq: the frequency of interest
+ *
+ * This function will match the cooling level corresponding to the
+ * requested @freq and return it.
+ *
+ * Return: The matched cooling level on success or THERMAL_CSTATE_INVALID
+ * otherwise.
+ */
+unsigned long cpufreq_cooling_get_level(unsigned int cpu, unsigned int freq)
+{
+       unsigned int val;
+
+       if (get_property(cpu, (unsigned long)freq, &val, GET_LEVEL))
+               return THERMAL_CSTATE_INVALID;
+
+       return (unsigned long)val;
+}
+EXPORT_SYMBOL_GPL(cpufreq_cooling_get_level);
+
+/**
+ * get_cpu_frequency - get the absolute value of frequency from level.
+ * @cpu: cpu for which frequency is fetched.
+ * @level: cooling level
+ *
+ * This function matches a cooling level with a frequency: a cooling level
+ * equals a cooling state of the cpu cooling device, and the corresponding
+ * frequency is returned,
+ *     e.g. level=0 --> 1st MAX FREQ, level=1 --> 2nd MAX FREQ, etc.
+ *
+ * Return: 0 on error, the corresponding frequency otherwise.
+ */
+static unsigned int get_cpu_frequency(unsigned int cpu, unsigned long level)
+{
+       int ret = 0;
+       unsigned int freq;
+
+       ret = get_property(cpu, level, &freq, GET_FREQ);
+       if (ret)
+               return 0;
+
+       return freq;
 }
 
 /**
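get_property() scans the cpufreq frequency table once to count the distinct valid entries (GET_MAXL) and again to translate between cooling levels and frequencies (GET_LEVEL / GET_FREQ); by convention, level 0 is always the highest frequency whether the table is stored ascending or descending. A small stand-alone model of that convention, using a made-up frequency table:

    #include <stdio.h>

    /* Distinct, valid frequencies in kHz, highest first; level 0 = fastest. */
    static const unsigned int freqs[] = { 1200000, 900000, 600000, 300000 };
    #define NLEVELS (sizeof(freqs) / sizeof(freqs[0]))

    static unsigned int level_to_freq(unsigned long level)
    {
        return level < NLEVELS ? freqs[level] : 0;
    }

    static long freq_to_level(unsigned int freq)
    {
        unsigned long i;

        for (i = 0; i < NLEVELS; i++)
            if (freqs[i] == freq)
                return (long)i;
        return -1;      /* the driver reports THERMAL_CSTATE_INVALID instead */
    }

    int main(void)
    {
        printf("max level: %zu\n", NLEVELS - 1);
        printf("level 1 -> %u kHz\n", level_to_freq(1));
        printf("900000 kHz -> level %ld\n", freq_to_level(900000));
        return 0;
    }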
@@ -163,13 +262,19 @@ static unsigned int get_cpu_frequency(unsigned int cpu, unsigned long level)
  * @cpufreq_device: cpufreq_cooling_device pointer containing frequency
  *     clipping data.
  * @cooling_state: value of the cooling state.
+ *
+ * Function used to make sure the cpufreq layer is aware of current thermal
+ * limits. The limits are applied by updating the cpufreq policy.
+ *
+ * Return: 0 on success, an error code otherwise (-EINVAL in case wrong
+ * cooling state).
  */
 static int cpufreq_apply_cooling(struct cpufreq_cooling_device *cpufreq_device,
-                               unsigned long cooling_state)
+                                unsigned long cooling_state)
 {
        unsigned int cpuid, clip_freq;
-       struct cpumask *maskPtr = &cpufreq_device->allowed_cpus;
-       unsigned int cpu = cpumask_any(maskPtr);
+       struct cpumask *mask = &cpufreq_device->allowed_cpus;
+       unsigned int cpu = cpumask_any(mask);
 
 
        /* Check if the old cooling action is same as new cooling action */
@@ -184,7 +289,7 @@ static int cpufreq_apply_cooling(struct cpufreq_cooling_device *cpufreq_device,
        cpufreq_device->cpufreq_val = clip_freq;
        notify_device = cpufreq_device;
 
-       for_each_cpu(cpuid, maskPtr) {
+       for_each_cpu(cpuid, mask) {
                if (is_cpufreq_valid(cpuid))
                        cpufreq_update_policy(cpuid);
        }
@@ -199,9 +304,15 @@ static int cpufreq_apply_cooling(struct cpufreq_cooling_device *cpufreq_device,
  * @nb:        struct notifier_block * with callback info.
  * @event: value showing cpufreq event for which this function invoked.
  * @data: callback-specific data
+ *
+ * Callback to highjack the notification on cpufreq policy transition.
+ * Every time there is a change in policy, we will intercept and
+ * update the cpufreq policy with thermal constraints.
+ *
+ * Return: 0 (success)
  */
 static int cpufreq_thermal_notifier(struct notifier_block *nb,
-                                       unsigned long event, void *data)
+                                   unsigned long event, void *data)
 {
        struct cpufreq_policy *policy = data;
        unsigned long max_freq = 0;
@@ -212,7 +323,7 @@ static int cpufreq_thermal_notifier(struct notifier_block *nb,
        if (cpumask_test_cpu(policy->cpu, &notify_device->allowed_cpus))
                max_freq = notify_device->cpufreq_val;
 
-       /* Never exceed user_policy.max*/
+       /* Never exceed user_policy.max */
        if (max_freq > policy->user_policy.max)
                max_freq = policy->user_policy.max;
 
@@ -222,50 +333,46 @@ static int cpufreq_thermal_notifier(struct notifier_block *nb,
        return 0;
 }
 
-/*
- * cpufreq cooling device callback functions are defined below
- */
+/* cpufreq cooling device callback functions are defined below */
 
 /**
  * cpufreq_get_max_state - callback function to get the max cooling state.
  * @cdev: thermal cooling device pointer.
  * @state: fill this variable with the max cooling state.
+ *
+ * Callback for the thermal cooling device to return the cpufreq
+ * max cooling state.
+ *
+ * Return: 0 on success, an error code otherwise.
  */
 static int cpufreq_get_max_state(struct thermal_cooling_device *cdev,
                                 unsigned long *state)
 {
        struct cpufreq_cooling_device *cpufreq_device = cdev->devdata;
-       struct cpumask *maskPtr = &cpufreq_device->allowed_cpus;
+       struct cpumask *mask = &cpufreq_device->allowed_cpus;
        unsigned int cpu;
-       struct cpufreq_frequency_table *table;
-       unsigned long count = 0;
-       int i = 0;
-
-       cpu = cpumask_any(maskPtr);
-       table = cpufreq_frequency_get_table(cpu);
-       if (!table) {
-               *state = 0;
-               return 0;
-       }
+       unsigned int count = 0;
+       int ret;
 
-       for (i = 0; (table[i].frequency != CPUFREQ_TABLE_END); i++) {
-               if (table[i].frequency == CPUFREQ_ENTRY_INVALID)
-                       continue;
-               count++;
-       }
+       cpu = cpumask_any(mask);
 
-       if (count > 0) {
-               *state = --count;
-               return 0;
-       }
+       ret = get_property(cpu, 0, &count, GET_MAXL);
 
-       return -EINVAL;
+       if (count > 0)
+               *state = count;
+
+       return ret;
 }
 
 /**
  * cpufreq_get_cur_state - callback function to get the current cooling state.
  * @cdev: thermal cooling device pointer.
  * @state: fill this variable with the current cooling state.
+ *
+ * Callback for the thermal cooling device to return the cpufreq
+ * current cooling state.
+ *
+ * Return: 0 on success, an error code otherwise.
  */
 static int cpufreq_get_cur_state(struct thermal_cooling_device *cdev,
                                 unsigned long *state)
@@ -273,6 +380,7 @@ static int cpufreq_get_cur_state(struct thermal_cooling_device *cdev,
        struct cpufreq_cooling_device *cpufreq_device = cdev->devdata;
 
        *state = cpufreq_device->cpufreq_state;
+
        return 0;
 }
 
@@ -280,6 +388,11 @@ static int cpufreq_get_cur_state(struct thermal_cooling_device *cdev,
  * cpufreq_set_cur_state - callback function to set the current cooling state.
  * @cdev: thermal cooling device pointer.
  * @state: set this variable to the current cooling state.
+ *
+ * Callback for the thermal cooling device to change the cpufreq
+ * current cooling state.
+ *
+ * Return: 0 on success, an error code otherwise.
  */
 static int cpufreq_set_cur_state(struct thermal_cooling_device *cdev,
                                 unsigned long state)
@@ -304,9 +417,16 @@ static struct notifier_block thermal_cpufreq_notifier_block = {
 /**
  * cpufreq_cooling_register - function to create cpufreq cooling device.
  * @clip_cpus: cpumask of cpus where the frequency constraints will happen.
+ *
+ * This interface function registers the cpufreq cooling device with the name
+ * "thermal-cpufreq-%x". This api can support multiple instances of cpufreq
+ * cooling devices.
+ *
+ * Return: a valid struct thermal_cooling_device pointer on success,
+ * on failure, it returns a corresponding ERR_PTR().
  */
-struct thermal_cooling_device *cpufreq_cooling_register(
-       const struct cpumask *clip_cpus)
+struct thermal_cooling_device *
+cpufreq_cooling_register(const struct cpumask *clip_cpus)
 {
        struct thermal_cooling_device *cool_dev;
        struct cpufreq_cooling_device *cpufreq_dev = NULL;
@@ -315,9 +435,9 @@ struct thermal_cooling_device *cpufreq_cooling_register(
        int ret = 0, i;
        struct cpufreq_policy policy;
 
-       /*Verify that all the clip cpus have same freq_min, freq_max limit*/
+       /* Verify that all the clip cpus have same freq_min, freq_max limit */
        for_each_cpu(i, clip_cpus) {
-               /*continue if cpufreq policy not found and not return error*/
+               /* continue if cpufreq policy not found and not return error */
                if (!cpufreq_get_policy(&policy, i))
                        continue;
                if (min == 0 && max == 0) {
@@ -325,12 +445,12 @@ struct thermal_cooling_device *cpufreq_cooling_register(
                        max = policy.cpuinfo.max_freq;
                } else {
                        if (min != policy.cpuinfo.min_freq ||
-                               max != policy.cpuinfo.max_freq)
+                           max != policy.cpuinfo.max_freq)
                                return ERR_PTR(-EINVAL);
                }
        }
        cpufreq_dev = kzalloc(sizeof(struct cpufreq_cooling_device),
-                       GFP_KERNEL);
+                             GFP_KERNEL);
        if (!cpufreq_dev)
                return ERR_PTR(-ENOMEM);
 
@@ -342,10 +462,11 @@ struct thermal_cooling_device *cpufreq_cooling_register(
                return ERR_PTR(-EINVAL);
        }
 
-       sprintf(dev_name, "thermal-cpufreq-%d", cpufreq_dev->id);
+       snprintf(dev_name, sizeof(dev_name), "thermal-cpufreq-%d",
+                cpufreq_dev->id);
 
        cool_dev = thermal_cooling_device_register(dev_name, cpufreq_dev,
-                                               &cpufreq_cooling_ops);
+                                                  &cpufreq_cooling_ops);
        if (!cool_dev) {
                release_idr(&cpufreq_idr, cpufreq_dev->id);
                kfree(cpufreq_dev);
@@ -358,17 +479,20 @@ struct thermal_cooling_device *cpufreq_cooling_register(
        /* Register the notifier for first cpufreq cooling device */
        if (cpufreq_dev_count == 0)
                cpufreq_register_notifier(&thermal_cpufreq_notifier_block,
-                                               CPUFREQ_POLICY_NOTIFIER);
+                                         CPUFREQ_POLICY_NOTIFIER);
        cpufreq_dev_count++;
 
        mutex_unlock(&cooling_cpufreq_lock);
+
        return cool_dev;
 }
-EXPORT_SYMBOL(cpufreq_cooling_register);
+EXPORT_SYMBOL_GPL(cpufreq_cooling_register);
 
 /**
  * cpufreq_cooling_unregister - function to remove cpufreq cooling device.
  * @cdev: thermal cooling device pointer.
+ *
+ * This interface function unregisters the "thermal-cpufreq-%x" cooling device.
  */
 void cpufreq_cooling_unregister(struct thermal_cooling_device *cdev)
 {
@@ -378,14 +502,13 @@ void cpufreq_cooling_unregister(struct thermal_cooling_device *cdev)
        cpufreq_dev_count--;
 
        /* Unregister the notifier for the last cpufreq cooling device */
-       if (cpufreq_dev_count == 0) {
+       if (cpufreq_dev_count == 0)
                cpufreq_unregister_notifier(&thermal_cpufreq_notifier_block,
-                                       CPUFREQ_POLICY_NOTIFIER);
-       }
+                                           CPUFREQ_POLICY_NOTIFIER);
        mutex_unlock(&cooling_cpufreq_lock);
 
        thermal_cooling_device_unregister(cpufreq_dev->cool_dev);
        release_idr(&cpufreq_idr, cpufreq_dev->id);
        kfree(cpufreq_dev);
 }
-EXPORT_SYMBOL(cpufreq_cooling_unregister);
+EXPORT_SYMBOL_GPL(cpufreq_cooling_unregister);
index 21419851fc0284a5447649580edb007ff079b471..786d19263ab0012cfb15e8b00e5dac6848352bd3 100644 (file)
@@ -37,7 +37,7 @@ static int db8500_cpufreq_cooling_probe(struct platform_device *pdev)
        cpumask_set_cpu(0, &mask_val);
        cdev = cpufreq_cooling_register(&mask_val);
 
-       if (IS_ERR_OR_NULL(cdev)) {
+       if (IS_ERR(cdev)) {
                dev_err(&pdev->dev, "Failed to register cooling device\n");
                return PTR_ERR(cdev);
        }
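The IS_ERR_OR_NULL() checks become IS_ERR() because cpufreq_cooling_register() (and thermal_zone_device_register() in the db8500 hunk below) report failure through ERR_PTR() encoded errors and never return NULL. A rough user-space imitation of that calling convention; the helpers below mimic, but are not, the kernel's ERR_PTR()/PTR_ERR()/IS_ERR():

    #include <errno.h>
    #include <stdint.h>
    #include <stdio.h>

    static void *err_ptr(long err) { return (void *)(intptr_t)err; }
    static long ptr_err(const void *p) { return (long)(intptr_t)p; }
    static int is_err(const void *p) { return (uintptr_t)p >= (uintptr_t)-4095; }

    static void *register_cooling(int fail)
    {
        static int dummy_device;

        return fail ? err_ptr(-EINVAL) : &dummy_device; /* never NULL on error */
    }

    int main(void)
    {
        void *cdev = register_cooling(1);

        if (is_err(cdev))       /* IS_ERR(), not IS_ERR_OR_NULL() */
            printf("register failed: %ld\n", ptr_err(cdev));
        return 0;
    }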
index 61ce60a35921d8ca543bcc275a5b7d479b98fa4f..1e3b3bf9f993be5d1e3eee1404091883dd21751f 100644 (file)
@@ -419,7 +419,8 @@ static int db8500_thermal_probe(struct platform_device *pdev)
        low_irq = platform_get_irq_byname(pdev, "IRQ_HOTMON_LOW");
        if (low_irq < 0) {
                dev_err(&pdev->dev, "Get IRQ_HOTMON_LOW failed.\n");
-               return low_irq;
+               ret = low_irq;
+               goto out_unlock;
        }
 
        ret = devm_request_threaded_irq(&pdev->dev, low_irq, NULL,
@@ -427,13 +428,14 @@ static int db8500_thermal_probe(struct platform_device *pdev)
                "dbx500_temp_low", pzone);
        if (ret < 0) {
                dev_err(&pdev->dev, "Failed to allocate temp low irq.\n");
-               return ret;
+               goto out_unlock;
        }
 
        high_irq = platform_get_irq_byname(pdev, "IRQ_HOTMON_HIGH");
        if (high_irq < 0) {
                dev_err(&pdev->dev, "Get IRQ_HOTMON_HIGH failed.\n");
-               return high_irq;
+               ret = high_irq;
+               goto out_unlock;
        }
 
        ret = devm_request_threaded_irq(&pdev->dev, high_irq, NULL,
@@ -441,15 +443,16 @@ static int db8500_thermal_probe(struct platform_device *pdev)
                "dbx500_temp_high", pzone);
        if (ret < 0) {
                dev_err(&pdev->dev, "Failed to allocate temp high irq.\n");
-               return ret;
+               goto out_unlock;
        }
 
        pzone->therm_dev = thermal_zone_device_register("db8500_thermal_zone",
                ptrips->num_trips, 0, pzone, &thdev_ops, NULL, 0, 0);
 
-       if (IS_ERR_OR_NULL(pzone->therm_dev)) {
+       if (IS_ERR(pzone->therm_dev)) {
                dev_err(&pdev->dev, "Register thermal zone device failed.\n");
-               return PTR_ERR(pzone->therm_dev);
+               ret = PTR_ERR(pzone->therm_dev);
+               goto out_unlock;
        }
        dev_info(&pdev->dev, "Thermal zone device registered.\n");
 
@@ -461,9 +464,11 @@ static int db8500_thermal_probe(struct platform_device *pdev)
 
        platform_set_drvdata(pdev, pzone);
        pzone->mode = THERMAL_DEVICE_ENABLED;
+
+out_unlock:
        mutex_unlock(&pzone->th_lock);
 
-       return 0;
+       return ret;
 }
 
 static int db8500_thermal_remove(struct platform_device *pdev)
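The db8500 probe errors above are funneled through a single out_unlock label so th_lock is always dropped before returning, instead of being leaked on the early-return paths. A minimal stand-alone illustration of that goto-unwind pattern; lock/unlock are trivial stand-ins:

    #include <stdio.h>

    static int locked;

    static void lock(void)   { locked = 1; }
    static void unlock(void) { locked = 0; }

    static int probe(int fail_step)
    {
        int ret = 0;

        lock();

        if (fail_step == 1) {
            ret = -1;
            goto out_unlock;
        }
        if (fail_step == 2) {
            ret = -2;
            goto out_unlock;
        }

        /* success path falls through with ret == 0 */
    out_unlock:
        unlock();       /* every exit path drops the lock exactly once */
        return ret;
    }

    int main(void)
    {
        int ret = probe(2);

        printf("ret=%d still locked=%d\n", ret, locked);
        return 0;
    }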
index 3078c403b42d84029968b23ff20b39ebd4af295d..4b15a5f270dc71a021bde1b6e96e8656720d45bc 100644 (file)
@@ -107,12 +107,13 @@ static int dove_get_temp(struct thermal_zone_device *thermal,
        }
 
        /*
-        * Calculate temperature. See Section 8.10.1 of 88AP510,
-        * Documentation/arm/Marvell/README
+        * Calculate temperature. According to Marvell internal
+        * documentation the formula for this is:
+        * Celsius = (322-reg)/1.3625
         */
        reg = readl_relaxed(priv->sensor);
        reg = (reg >> DOVE_THERMAL_TEMP_OFFSET) & DOVE_THERMAL_TEMP_MASK;
-       *temp = ((2281638UL - (7298*reg)) / 10);
+       *temp = ((3220000000UL - (10000000UL * reg)) / 13625);
 
        return 0;
 }
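As a sanity check on the dove conversion: with reg = 100 (an arbitrary example, not calibration data), the new expression gives (3220000000 - 10000000 * 100) / 13625 = 162935 millidegrees, matching the documented (322 - 100) / 1.3625, roughly 162.9 degrees C.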
index b777ae6f0a8fed33d44701b867239e4973d37749..d20ce9e614034ba4077d797b5584cfad4d994727 100644 (file)
 #define IDLE_INTERVAL 10000
 #define MCELSIUS       1000
 
-#ifdef CONFIG_EXYNOS_THERMAL_EMUL
+#ifdef CONFIG_THERMAL_EMULATION
 #define EXYNOS_EMUL_TIME       0x57F0
 #define EXYNOS_EMUL_TIME_SHIFT 16
 #define EXYNOS_EMUL_DATA_SHIFT 8
 #define EXYNOS_EMUL_DATA_MASK  0xFF
 #define EXYNOS_EMUL_ENABLE     0x1
-#endif /* CONFIG_EXYNOS_THERMAL_EMUL */
+#endif /* CONFIG_THERMAL_EMULATION */
 
 /* CPU Zone information */
 #define PANIC_ZONE      4
@@ -143,6 +143,7 @@ struct      thermal_cooling_conf {
 struct thermal_sensor_conf {
        char name[SENSOR_NAME_LEN];
        int (*read_temperature)(void *data);
+       int (*write_emul_temp)(void *drv_data, unsigned long temp);
        struct thermal_trip_point_conf trip_data;
        struct thermal_cooling_conf cooling_data;
        void *private_data;
@@ -240,26 +241,6 @@ static int exynos_get_crit_temp(struct thermal_zone_device *thermal,
        return ret;
 }
 
-static int exynos_get_frequency_level(unsigned int cpu, unsigned int freq)
-{
-       int i = 0, ret = -EINVAL;
-       struct cpufreq_frequency_table *table = NULL;
-#ifdef CONFIG_CPU_FREQ
-       table = cpufreq_frequency_get_table(cpu);
-#endif
-       if (!table)
-               return ret;
-
-       while (table[i].frequency != CPUFREQ_TABLE_END) {
-               if (table[i].frequency == CPUFREQ_ENTRY_INVALID)
-                       continue;
-               if (table[i].frequency == freq)
-                       return i;
-               i++;
-       }
-       return ret;
-}
-
 /* Bind callback functions for thermal zone */
 static int exynos_bind(struct thermal_zone_device *thermal,
                        struct thermal_cooling_device *cdev)
@@ -286,8 +267,8 @@ static int exynos_bind(struct thermal_zone_device *thermal,
        /* Bind the thermal zone to the cpufreq cooling device */
        for (i = 0; i < tab_size; i++) {
                clip_data = (struct freq_clip_table *)&(tab_ptr[i]);
-               level = exynos_get_frequency_level(0, clip_data->freq_clip_max);
-               if (level < 0)
+               level = cpufreq_cooling_get_level(0, clip_data->freq_clip_max);
+               if (level == THERMAL_CSTATE_INVALID)
                        return 0;
                switch (GET_ZONE(i)) {
                case MONITOR_ZONE:
@@ -367,6 +348,23 @@ static int exynos_get_temp(struct thermal_zone_device *thermal,
        return 0;
 }
 
+/* Set emulated temperature callback function for thermal zone */
+static int exynos_set_emul_temp(struct thermal_zone_device *thermal,
+                                               unsigned long temp)
+{
+       void *data;
+       int ret = -EINVAL;
+
+       if (!th_zone->sensor_conf) {
+               pr_info("Temperature sensor not initialised\n");
+               return -EINVAL;
+       }
+       data = th_zone->sensor_conf->private_data;
+       if (th_zone->sensor_conf->write_emul_temp)
+               ret = th_zone->sensor_conf->write_emul_temp(data, temp);
+       return ret;
+}
+
 /* Get the temperature trend */
 static int exynos_get_trend(struct thermal_zone_device *thermal,
                        int trip, enum thermal_trend *trend)
@@ -390,6 +388,7 @@ static struct thermal_zone_device_ops const exynos_dev_ops = {
        .bind = exynos_bind,
        .unbind = exynos_unbind,
        .get_temp = exynos_get_temp,
+       .set_emul_temp = exynos_set_emul_temp,
        .get_trend = exynos_get_trend,
        .get_mode = exynos_get_mode,
        .set_mode = exynos_set_mode,
@@ -712,6 +711,47 @@ static int exynos_tmu_read(struct exynos_tmu_data *data)
        return temp;
 }
 
+#ifdef CONFIG_THERMAL_EMULATION
+static int exynos_tmu_set_emulation(void *drv_data, unsigned long temp)
+{
+       struct exynos_tmu_data *data = drv_data;
+       unsigned int reg;
+       int ret = -EINVAL;
+
+       if (data->soc == SOC_ARCH_EXYNOS4210)
+               goto out;
+
+       if (temp && temp < MCELSIUS)
+               goto out;
+
+       mutex_lock(&data->lock);
+       clk_enable(data->clk);
+
+       reg = readl(data->base + EXYNOS_EMUL_CON);
+
+       if (temp) {
+               temp /= MCELSIUS;
+
+               reg = (EXYNOS_EMUL_TIME << EXYNOS_EMUL_TIME_SHIFT) |
+                       (temp_to_code(data, temp)
+                        << EXYNOS_EMUL_DATA_SHIFT) | EXYNOS_EMUL_ENABLE;
+       } else {
+               reg &= ~EXYNOS_EMUL_ENABLE;
+       }
+
+       writel(reg, data->base + EXYNOS_EMUL_CON);
+
+       clk_disable(data->clk);
+       mutex_unlock(&data->lock);
+       return 0;
+out:
+       return ret;
+}
+#else
+static int exynos_tmu_set_emulation(void *drv_data, unsigned long temp)
+       { return -EINVAL; }
+#endif /* CONFIG_THERMAL_EMULATION */
+
 static void exynos_tmu_work(struct work_struct *work)
 {
        struct exynos_tmu_data *data = container_of(work,
@@ -745,6 +785,7 @@ static irqreturn_t exynos_tmu_irq(int irq, void *id)
 static struct thermal_sensor_conf exynos_sensor_conf = {
        .name                   = "exynos-therm",
        .read_temperature       = (int (*)(void *))exynos_tmu_read,
+       .write_emul_temp        = exynos_tmu_set_emulation,
 };
 
 #if defined(CONFIG_CPU_EXYNOS4210)
@@ -813,6 +854,10 @@ static const struct of_device_id exynos_tmu_match[] = {
                .compatible = "samsung,exynos4210-tmu",
                .data = (void *)EXYNOS4210_TMU_DRV_DATA,
        },
+       {
+               .compatible = "samsung,exynos4412-tmu",
+               .data = (void *)EXYNOS_TMU_DRV_DATA,
+       },
        {
                .compatible = "samsung,exynos5250-tmu",
                .data = (void *)EXYNOS_TMU_DRV_DATA,
@@ -851,93 +896,6 @@ static inline struct  exynos_tmu_platform_data *exynos_get_driver_data(
                        platform_get_device_id(pdev)->driver_data;
 }
 
-#ifdef CONFIG_EXYNOS_THERMAL_EMUL
-static ssize_t exynos_tmu_emulation_show(struct device *dev,
-                                        struct device_attribute *attr,
-                                        char *buf)
-{
-       struct platform_device *pdev = container_of(dev,
-                                       struct platform_device, dev);
-       struct exynos_tmu_data *data = platform_get_drvdata(pdev);
-       unsigned int reg;
-       u8 temp_code;
-       int temp = 0;
-
-       if (data->soc == SOC_ARCH_EXYNOS4210)
-               goto out;
-
-       mutex_lock(&data->lock);
-       clk_enable(data->clk);
-       reg = readl(data->base + EXYNOS_EMUL_CON);
-       clk_disable(data->clk);
-       mutex_unlock(&data->lock);
-
-       if (reg & EXYNOS_EMUL_ENABLE) {
-               reg >>= EXYNOS_EMUL_DATA_SHIFT;
-               temp_code = reg & EXYNOS_EMUL_DATA_MASK;
-               temp = code_to_temp(data, temp_code);
-       }
-out:
-       return sprintf(buf, "%d\n", temp * MCELSIUS);
-}
-
-static ssize_t exynos_tmu_emulation_store(struct device *dev,
-                                       struct device_attribute *attr,
-                                       const char *buf, size_t count)
-{
-       struct platform_device *pdev = container_of(dev,
-                                       struct platform_device, dev);
-       struct exynos_tmu_data *data = platform_get_drvdata(pdev);
-       unsigned int reg;
-       int temp;
-
-       if (data->soc == SOC_ARCH_EXYNOS4210)
-               goto out;
-
-       if (!sscanf(buf, "%d\n", &temp) || temp < 0)
-               return -EINVAL;
-
-       mutex_lock(&data->lock);
-       clk_enable(data->clk);
-
-       reg = readl(data->base + EXYNOS_EMUL_CON);
-
-       if (temp) {
-               /* Both CELSIUS and MCELSIUS type are available for input */
-               if (temp > MCELSIUS)
-                       temp /= MCELSIUS;
-
-               reg = (EXYNOS_EMUL_TIME << EXYNOS_EMUL_TIME_SHIFT) |
-                       (temp_to_code(data, (temp / MCELSIUS))
-                        << EXYNOS_EMUL_DATA_SHIFT) | EXYNOS_EMUL_ENABLE;
-       } else {
-               reg &= ~EXYNOS_EMUL_ENABLE;
-       }
-
-       writel(reg, data->base + EXYNOS_EMUL_CON);
-
-       clk_disable(data->clk);
-       mutex_unlock(&data->lock);
-
-out:
-       return count;
-}
-
-static DEVICE_ATTR(emulation, 0644, exynos_tmu_emulation_show,
-                                       exynos_tmu_emulation_store);
-static int create_emulation_sysfs(struct device *dev)
-{
-       return device_create_file(dev, &dev_attr_emulation);
-}
-static void remove_emulation_sysfs(struct device *dev)
-{
-       device_remove_file(dev, &dev_attr_emulation);
-}
-#else
-static inline int create_emulation_sysfs(struct device *dev) { return 0; }
-static inline void remove_emulation_sysfs(struct device *dev) {}
-#endif
-
 static int exynos_tmu_probe(struct platform_device *pdev)
 {
        struct exynos_tmu_data *data;
@@ -983,12 +941,16 @@ static int exynos_tmu_probe(struct platform_device *pdev)
                return ret;
        }
 
-       data->clk = clk_get(NULL, "tmu_apbif");
+       data->clk = devm_clk_get(&pdev->dev, "tmu_apbif");
        if (IS_ERR(data->clk)) {
                dev_err(&pdev->dev, "Failed to get clock\n");
                return  PTR_ERR(data->clk);
        }
 
+       ret = clk_prepare(data->clk);
+       if (ret)
+               return ret;
+
        if (pdata->type == SOC_ARCH_EXYNOS ||
                                pdata->type == SOC_ARCH_EXYNOS4210)
                data->soc = pdata->type;
@@ -1037,14 +999,10 @@ static int exynos_tmu_probe(struct platform_device *pdev)
                goto err_clk;
        }
 
-       ret = create_emulation_sysfs(&pdev->dev);
-       if (ret)
-               dev_err(&pdev->dev, "Failed to create emulation mode sysfs node\n");
-
        return 0;
 err_clk:
        platform_set_drvdata(pdev, NULL);
-       clk_put(data->clk);
+       clk_unprepare(data->clk);
        return ret;
 }
 
@@ -1052,13 +1010,11 @@ static int exynos_tmu_remove(struct platform_device *pdev)
 {
        struct exynos_tmu_data *data = platform_get_drvdata(pdev);
 
-       remove_emulation_sysfs(&pdev->dev);
-
        exynos_tmu_control(pdev, false);
 
        exynos_unregister_thermal();
 
-       clk_put(data->clk);
+       clk_unprepare(data->clk);
 
        platform_set_drvdata(pdev, NULL);
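With these changes the emulation interface is no longer a driver-private sysfs node: the driver hands the core a write_emul_temp callback through its sensor configuration, and the generic emul_temp attribute added in thermal_core.c (see the new file below) ends up calling it with a value in millicelsius, where 0 disables emulation. A minimal sketch of what a sensor driver supplies, using the driver-local struct thermal_sensor_conf shown above and otherwise invented names:

	/* Hypothetical sensor hooks, modelled on the exynos change above. */
	static int my_sensor_read(void *drv_data)
	{
		return 0;	/* stub: report whatever the hardware says */
	}

	static int my_sensor_write_emul_temp(void *drv_data, unsigned long temp)
	{
		/* temp is in millicelsius; 0 means switch emulation off */
		/* program the hardware emulation register here */
		return 0;
	}

	static struct thermal_sensor_conf my_sensor_conf = {
		.name			= "my-sensor",
		.read_temperature	= my_sensor_read,
		.write_emul_temp	= my_sensor_write_emul_temp,
	};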
 
index 792479f2b64b7d25b4e1f8a1ac68de8271c65f84..944ba2f340c83cfed436c569c949d58dc6a60dfe 100644 (file)
@@ -22,9 +22,6 @@
  * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  */
 
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/module.h>
 #include <linux/thermal.h>
 
 #include "thermal_core.h"
@@ -111,23 +108,15 @@ static int fair_share_throttle(struct thermal_zone_device *tz, int trip)
 static struct thermal_governor thermal_gov_fair_share = {
        .name           = "fair_share",
        .throttle       = fair_share_throttle,
-       .owner          = THIS_MODULE,
 };
 
-static int __init thermal_gov_fair_share_init(void)
+int thermal_gov_fair_share_register(void)
 {
        return thermal_register_governor(&thermal_gov_fair_share);
 }
 
-static void __exit thermal_gov_fair_share_exit(void)
+void thermal_gov_fair_share_unregister(void)
 {
        thermal_unregister_governor(&thermal_gov_fair_share);
 }
 
-/* This should load after thermal framework */
-fs_initcall(thermal_gov_fair_share_init);
-module_exit(thermal_gov_fair_share_exit);
-
-MODULE_AUTHOR("Durgadoss R");
-MODULE_DESCRIPTION("A simple weight based thermal throttling governor");
-MODULE_LICENSE("GPL");
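The governor is no longer built as a self-registering module; it now exposes thermal_gov_fair_share_register()/..._unregister(), and the core is expected to call these, together with the analogous helpers of the other governors (such as the step_wise ones further below), from its own init and exit paths. A sketch of what such a core-side call site could look like, assuming only the helpers declared in this patch (the real call site is not part of this excerpt):

	#include "thermal_core.h"	/* assumed to declare the *_register() helpers */

	static int __init example_register_governors(void)
	{
		int ret;

		ret = thermal_gov_step_wise_register();
		if (ret)
			return ret;

		return thermal_gov_fair_share_register();
	}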
index e5500edb528568a009597b9336fa3d951df852ac..dfeceaffbc03c3f462170d721124ad3b3eac1db9 100644 (file)
@@ -41,21 +41,21 @@ static int kirkwood_get_temp(struct thermal_zone_device *thermal,
        reg = readl_relaxed(priv->sensor);
 
        /* Valid check */
-       if (!(reg >> KIRKWOOD_THERMAL_VALID_OFFSET) &
-           KIRKWOOD_THERMAL_VALID_MASK) {
+       if (!((reg >> KIRKWOOD_THERMAL_VALID_OFFSET) &
+           KIRKWOOD_THERMAL_VALID_MASK)) {
                dev_err(&thermal->device,
                        "Temperature sensor reading not valid\n");
                return -EIO;
        }
 
        /*
-        * Calculate temperature. See Section 8.10.1 of the 88AP510,
-        * datasheet, which has the same sensor.
-        * Documentation/arm/Marvell/README
+        * Calculate temperature. According to Marvell internal
+        * documentation the formula for this is:
+        * Celsius = (322-reg)/1.3625
         */
        reg = (reg >> KIRKWOOD_THERMAL_TEMP_OFFSET) &
                KIRKWOOD_THERMAL_TEMP_MASK;
-       *temp = ((2281638UL - (7298*reg)) / 10);
+       *temp = ((3220000000UL - (10000000UL * reg)) / 13625);
 
        return 0;
 }
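The first kirkwood hunk fixes an operator-precedence bug rather than changing behaviour intentionally: '!' binds more tightly than '&', so the old test computed (!(reg >> KIRKWOOD_THERMAL_VALID_OFFSET)) & KIRKWOOD_THERMAL_VALID_MASK. Whenever any status bit above the valid bit happened to be set, the shifted value was non-zero, the negation yielded 0, and the check never reported an invalid reading even with the valid bit clear. Masking before negating, as the new code does, tests exactly the valid bit. The temperature formula change mirrors the dove one above and uses the same integer scaling.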
index 2cc5b6115e3e200d13955f8103b97a2ddf3c0514..8d7edd4c82285f2bce93fd811b013bb4f6a9a54a 100644 (file)
@@ -24,6 +24,7 @@
 #include <linux/io.h>
 #include <linux/module.h>
 #include <linux/platform_device.h>
+#include <linux/pm_runtime.h>
 #include <linux/reboot.h>
 #include <linux/slab.h>
 #include <linux/spinlock.h>
@@ -377,6 +378,9 @@ static int rcar_thermal_probe(struct platform_device *pdev)
        spin_lock_init(&common->lock);
        common->dev = dev;
 
+       pm_runtime_enable(dev);
+       pm_runtime_get_sync(dev);
+
        irq = platform_get_resource(pdev, IORESOURCE_IRQ, 0);
        if (irq) {
                int ret;
@@ -419,12 +423,15 @@ static int rcar_thermal_probe(struct platform_device *pdev)
                priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL);
                if (!priv) {
                        dev_err(dev, "Could not allocate priv\n");
-                       return -ENOMEM;
+                       ret = -ENOMEM;
+                       goto error_unregister;
                }
 
                priv->base = devm_ioremap_resource(dev, res);
-               if (IS_ERR(priv->base))
-                       return PTR_ERR(priv->base);
+               if (IS_ERR(priv->base)) {
+                       ret = PTR_ERR(priv->base);
+                       goto error_unregister;
+               }
 
                priv->common = common;
                priv->id = i;
@@ -443,10 +450,10 @@ static int rcar_thermal_probe(struct platform_device *pdev)
                        goto error_unregister;
                }
 
-               list_move_tail(&priv->list, &common->head);
-
                if (rcar_has_irq_support(priv))
                        rcar_thermal_irq_enable(priv);
+
+               list_move_tail(&priv->list, &common->head);
        }
 
        platform_set_drvdata(pdev, common);
@@ -456,8 +463,14 @@ static int rcar_thermal_probe(struct platform_device *pdev)
        return 0;
 
 error_unregister:
-       rcar_thermal_for_each_priv(priv, common)
+       rcar_thermal_for_each_priv(priv, common) {
                thermal_zone_device_unregister(priv->zone);
+               if (rcar_has_irq_support(priv))
+                       rcar_thermal_irq_disable(priv);
+       }
+
+       pm_runtime_put_sync(dev);
+       pm_runtime_disable(dev);
 
        return ret;
 }
@@ -465,13 +478,20 @@ error_unregister:
 static int rcar_thermal_remove(struct platform_device *pdev)
 {
        struct rcar_thermal_common *common = platform_get_drvdata(pdev);
+       struct device *dev = &pdev->dev;
        struct rcar_thermal_priv *priv;
 
-       rcar_thermal_for_each_priv(priv, common)
+       rcar_thermal_for_each_priv(priv, common) {
                thermal_zone_device_unregister(priv->zone);
+               if (rcar_has_irq_support(priv))
+                       rcar_thermal_irq_disable(priv);
+       }
 
        platform_set_drvdata(pdev, NULL);
 
+       pm_runtime_put_sync(dev);
+       pm_runtime_disable(dev);
+
        return 0;
 }
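The rcar changes pair every acquisition in probe with a release on both the error path and in remove: runtime PM is enabled and referenced up front, and the unwind path now drops that reference, disables runtime PM, and also disables the per-zone interrupts before unregistering. A minimal sketch of the balanced pattern, with invented names:

	#include <linux/platform_device.h>
	#include <linux/pm_runtime.h>

	static int example_setup(struct platform_device *pdev)
	{
		return 0;	/* stub for whatever can fail later in probe */
	}

	static int example_probe(struct platform_device *pdev)
	{
		struct device *dev = &pdev->dev;
		int ret;

		pm_runtime_enable(dev);
		pm_runtime_get_sync(dev);

		ret = example_setup(pdev);
		if (ret)
			goto error;

		return 0;

	error:
		pm_runtime_put_sync(dev);	/* undo pm_runtime_get_sync() */
		pm_runtime_disable(dev);	/* undo pm_runtime_enable() */
		return ret;
	}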
 
index 407cde3211c1bccfcf1e43515fb0a47a5a99e67c..4d4ddae1a99183cee9705f24e7acdc91cde62cc6 100644 (file)
@@ -22,9 +22,6 @@
  * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  */
 
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/module.h>
 #include <linux/thermal.h>
 
 #include "thermal_core.h"
@@ -59,9 +56,12 @@ static unsigned long get_target_state(struct thermal_instance *instance,
 
        switch (trend) {
        case THERMAL_TREND_RAISING:
-               if (throttle)
+               if (throttle) {
                        cur_state = cur_state < instance->upper ?
                                    (cur_state + 1) : instance->upper;
+                       if (cur_state < instance->lower)
+                               cur_state = instance->lower;
+               }
                break;
        case THERMAL_TREND_RAISE_FULL:
                if (throttle)
@@ -71,8 +71,11 @@ static unsigned long get_target_state(struct thermal_instance *instance,
                if (cur_state == instance->lower) {
                        if (!throttle)
                                cur_state = -1;
-               } else
+               } else {
                        cur_state -= 1;
+                       if (cur_state > instance->upper)
+                               cur_state = instance->upper;
+               }
                break;
        case THERMAL_TREND_DROP_FULL:
                if (cur_state == instance->lower) {
@@ -180,23 +183,14 @@ static int step_wise_throttle(struct thermal_zone_device *tz, int trip)
 static struct thermal_governor thermal_gov_step_wise = {
        .name           = "step_wise",
        .throttle       = step_wise_throttle,
-       .owner          = THIS_MODULE,
 };
 
-static int __init thermal_gov_step_wise_init(void)
+int thermal_gov_step_wise_register(void)
 {
        return thermal_register_governor(&thermal_gov_step_wise);
 }
 
-static void __exit thermal_gov_step_wise_exit(void)
+void thermal_gov_step_wise_unregister(void)
 {
        thermal_unregister_governor(&thermal_gov_step_wise);
 }
-
-/* This should load after thermal framework */
-fs_initcall(thermal_gov_step_wise_init);
-module_exit(thermal_gov_step_wise_exit);
-
-MODULE_AUTHOR("Durgadoss R");
-MODULE_DESCRIPTION("A step-by-step thermal throttling governor");
-MODULE_LICENSE("GPL");
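The step_wise change above also clamps the next cooling state into the [instance->lower, instance->upper] window even when the current state starts outside it. For example, with lower = 2, upper = 5 and a current state of 0, a raising trend with throttling previously stepped to 1, still below the allowed minimum; with the clamp it jumps straight to 2. Symmetrically, a dropping trend starting above upper (say at 8) is now pulled back to 5 instead of merely stepping down to 7 outside the window.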
diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c
new file mode 100644 (file)
index 0000000..d755440
--- /dev/null
@@ -0,0 +1,2011 @@
+/*
+ *  thermal_core.c - Generic Thermal Management Sysfs support.
+ *
+ *  Copyright (C) 2008 Intel Corp
+ *  Copyright (C) 2008 Zhang Rui <rui.zhang@intel.com>
+ *  Copyright (C) 2008 Sujith Thomas <sujith.thomas@intel.com>
+ *
+ *  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; version 2 of the License.
+ *
+ *  This program is distributed in the hope that it will be useful, but
+ *  WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License along
+ *  with this program; if not, write to the Free Software Foundation, Inc.,
+ *  59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/kdev_t.h>
+#include <linux/idr.h>
+#include <linux/thermal.h>
+#include <linux/reboot.h>
+#include <net/netlink.h>
+#include <net/genetlink.h>
+
+#include "thermal_core.h"
+
+MODULE_AUTHOR("Zhang Rui");
+MODULE_DESCRIPTION("Generic thermal management sysfs support");
+MODULE_LICENSE("GPL v2");
+
+static DEFINE_IDR(thermal_tz_idr);
+static DEFINE_IDR(thermal_cdev_idr);
+static DEFINE_MUTEX(thermal_idr_lock);
+
+static LIST_HEAD(thermal_tz_list);
+static LIST_HEAD(thermal_cdev_list);
+static LIST_HEAD(thermal_governor_list);
+
+static DEFINE_MUTEX(thermal_list_lock);
+static DEFINE_MUTEX(thermal_governor_lock);
+
+static struct thermal_governor *__find_governor(const char *name)
+{
+       struct thermal_governor *pos;
+
+       list_for_each_entry(pos, &thermal_governor_list, governor_list)
+               if (!strnicmp(name, pos->name, THERMAL_NAME_LENGTH))
+                       return pos;
+
+       return NULL;
+}
+
+int thermal_register_governor(struct thermal_governor *governor)
+{
+       int err;
+       const char *name;
+       struct thermal_zone_device *pos;
+
+       if (!governor)
+               return -EINVAL;
+
+       mutex_lock(&thermal_governor_lock);
+
+       err = -EBUSY;
+       if (__find_governor(governor->name) == NULL) {
+               err = 0;
+               list_add(&governor->governor_list, &thermal_governor_list);
+       }
+
+       mutex_lock(&thermal_list_lock);
+
+       list_for_each_entry(pos, &thermal_tz_list, node) {
+               if (pos->governor)
+                       continue;
+               if (pos->tzp)
+                       name = pos->tzp->governor_name;
+               else
+                       name = DEFAULT_THERMAL_GOVERNOR;
+               if (!strnicmp(name, governor->name, THERMAL_NAME_LENGTH))
+                       pos->governor = governor;
+       }
+
+       mutex_unlock(&thermal_list_lock);
+       mutex_unlock(&thermal_governor_lock);
+
+       return err;
+}
+
+void thermal_unregister_governor(struct thermal_governor *governor)
+{
+       struct thermal_zone_device *pos;
+
+       if (!governor)
+               return;
+
+       mutex_lock(&thermal_governor_lock);
+
+       if (__find_governor(governor->name) == NULL)
+               goto exit;
+
+       mutex_lock(&thermal_list_lock);
+
+       list_for_each_entry(pos, &thermal_tz_list, node) {
+               if (!strnicmp(pos->governor->name, governor->name,
+                                               THERMAL_NAME_LENGTH))
+                       pos->governor = NULL;
+       }
+
+       mutex_unlock(&thermal_list_lock);
+       list_del(&governor->governor_list);
+exit:
+       mutex_unlock(&thermal_governor_lock);
+       return;
+}
+
+static int get_idr(struct idr *idr, struct mutex *lock, int *id)
+{
+       int ret;
+
+       if (lock)
+               mutex_lock(lock);
+       ret = idr_alloc(idr, NULL, 0, 0, GFP_KERNEL);
+       if (lock)
+               mutex_unlock(lock);
+       if (unlikely(ret < 0))
+               return ret;
+       *id = ret;
+       return 0;
+}
+
+static void release_idr(struct idr *idr, struct mutex *lock, int id)
+{
+       if (lock)
+               mutex_lock(lock);
+       idr_remove(idr, id);
+       if (lock)
+               mutex_unlock(lock);
+}
+
+int get_tz_trend(struct thermal_zone_device *tz, int trip)
+{
+       enum thermal_trend trend;
+
+       if (!tz->ops->get_trend || tz->ops->get_trend(tz, trip, &trend)) {
+               if (tz->temperature > tz->last_temperature)
+                       trend = THERMAL_TREND_RAISING;
+               else if (tz->temperature < tz->last_temperature)
+                       trend = THERMAL_TREND_DROPPING;
+               else
+                       trend = THERMAL_TREND_STABLE;
+       }
+
+       return trend;
+}
+EXPORT_SYMBOL(get_tz_trend);
+
+struct thermal_instance *get_thermal_instance(struct thermal_zone_device *tz,
+                       struct thermal_cooling_device *cdev, int trip)
+{
+       struct thermal_instance *pos = NULL;
+       struct thermal_instance *target_instance = NULL;
+
+       mutex_lock(&tz->lock);
+       mutex_lock(&cdev->lock);
+
+       list_for_each_entry(pos, &tz->thermal_instances, tz_node) {
+               if (pos->tz == tz && pos->trip == trip && pos->cdev == cdev) {
+                       target_instance = pos;
+                       break;
+               }
+       }
+
+       mutex_unlock(&cdev->lock);
+       mutex_unlock(&tz->lock);
+
+       return target_instance;
+}
+EXPORT_SYMBOL(get_thermal_instance);
+
+static void print_bind_err_msg(struct thermal_zone_device *tz,
+                       struct thermal_cooling_device *cdev, int ret)
+{
+       dev_err(&tz->device, "binding zone %s with cdev %s failed:%d\n",
+                               tz->type, cdev->type, ret);
+}
+
+static void __bind(struct thermal_zone_device *tz, int mask,
+                       struct thermal_cooling_device *cdev)
+{
+       int i, ret;
+
+       for (i = 0; i < tz->trips; i++) {
+               if (mask & (1 << i)) {
+                       ret = thermal_zone_bind_cooling_device(tz, i, cdev,
+                                       THERMAL_NO_LIMIT, THERMAL_NO_LIMIT);
+                       if (ret)
+                               print_bind_err_msg(tz, cdev, ret);
+               }
+       }
+}
+
+static void __unbind(struct thermal_zone_device *tz, int mask,
+                       struct thermal_cooling_device *cdev)
+{
+       int i;
+
+       for (i = 0; i < tz->trips; i++)
+               if (mask & (1 << i))
+                       thermal_zone_unbind_cooling_device(tz, i, cdev);
+}
+
+static void bind_cdev(struct thermal_cooling_device *cdev)
+{
+       int i, ret;
+       const struct thermal_zone_params *tzp;
+       struct thermal_zone_device *pos = NULL;
+
+       mutex_lock(&thermal_list_lock);
+
+       list_for_each_entry(pos, &thermal_tz_list, node) {
+               if (!pos->tzp && !pos->ops->bind)
+                       continue;
+
+               if (!pos->tzp && pos->ops->bind) {
+                       ret = pos->ops->bind(pos, cdev);
+                       if (ret)
+                               print_bind_err_msg(pos, cdev, ret);
+               }
+
+               tzp = pos->tzp;
+               if (!tzp || !tzp->tbp)
+                       continue;
+
+               for (i = 0; i < tzp->num_tbps; i++) {
+                       if (tzp->tbp[i].cdev || !tzp->tbp[i].match)
+                               continue;
+                       if (tzp->tbp[i].match(pos, cdev))
+                               continue;
+                       tzp->tbp[i].cdev = cdev;
+                       __bind(pos, tzp->tbp[i].trip_mask, cdev);
+               }
+       }
+
+       mutex_unlock(&thermal_list_lock);
+}
+
+static void bind_tz(struct thermal_zone_device *tz)
+{
+       int i, ret;
+       struct thermal_cooling_device *pos = NULL;
+       const struct thermal_zone_params *tzp = tz->tzp;
+
+       if (!tzp && !tz->ops->bind)
+               return;
+
+       mutex_lock(&thermal_list_lock);
+
+       /* If there is no platform data, try to use ops->bind */
+       if (!tzp && tz->ops->bind) {
+               list_for_each_entry(pos, &thermal_cdev_list, node) {
+                       ret = tz->ops->bind(tz, pos);
+                       if (ret)
+                               print_bind_err_msg(tz, pos, ret);
+               }
+               goto exit;
+       }
+
+       if (!tzp || !tzp->tbp)
+               goto exit;
+
+       list_for_each_entry(pos, &thermal_cdev_list, node) {
+               for (i = 0; i < tzp->num_tbps; i++) {
+                       if (tzp->tbp[i].cdev || !tzp->tbp[i].match)
+                               continue;
+                       if (tzp->tbp[i].match(tz, pos))
+                               continue;
+                       tzp->tbp[i].cdev = pos;
+                       __bind(tz, tzp->tbp[i].trip_mask, pos);
+               }
+       }
+exit:
+       mutex_unlock(&thermal_list_lock);
+}
+
+static void thermal_zone_device_set_polling(struct thermal_zone_device *tz,
+                                           int delay)
+{
+       if (delay > 1000)
+               mod_delayed_work(system_freezable_wq, &tz->poll_queue,
+                                round_jiffies(msecs_to_jiffies(delay)));
+       else if (delay)
+               mod_delayed_work(system_freezable_wq, &tz->poll_queue,
+                                msecs_to_jiffies(delay));
+       else
+               cancel_delayed_work(&tz->poll_queue);
+}
+
+static void monitor_thermal_zone(struct thermal_zone_device *tz)
+{
+       mutex_lock(&tz->lock);
+
+       if (tz->passive)
+               thermal_zone_device_set_polling(tz, tz->passive_delay);
+       else if (tz->polling_delay)
+               thermal_zone_device_set_polling(tz, tz->polling_delay);
+       else
+               thermal_zone_device_set_polling(tz, 0);
+
+       mutex_unlock(&tz->lock);
+}
+
+static void handle_non_critical_trips(struct thermal_zone_device *tz,
+                       int trip, enum thermal_trip_type trip_type)
+{
+       if (tz->governor)
+               tz->governor->throttle(tz, trip);
+}
+
+static void handle_critical_trips(struct thermal_zone_device *tz,
+                               int trip, enum thermal_trip_type trip_type)
+{
+       long trip_temp;
+
+       tz->ops->get_trip_temp(tz, trip, &trip_temp);
+
+       /* If we have not crossed the trip_temp, we do not care. */
+       if (tz->temperature < trip_temp)
+               return;
+
+       if (tz->ops->notify)
+               tz->ops->notify(tz, trip, trip_type);
+
+       if (trip_type == THERMAL_TRIP_CRITICAL) {
+               dev_emerg(&tz->device,
+                         "critical temperature reached(%d C),shutting down\n",
+                         tz->temperature / 1000);
+               orderly_poweroff(true);
+       }
+}
+
+static void handle_thermal_trip(struct thermal_zone_device *tz, int trip)
+{
+       enum thermal_trip_type type;
+
+       tz->ops->get_trip_type(tz, trip, &type);
+
+       if (type == THERMAL_TRIP_CRITICAL || type == THERMAL_TRIP_HOT)
+               handle_critical_trips(tz, trip, type);
+       else
+               handle_non_critical_trips(tz, trip, type);
+       /*
+        * Alright, we handled this trip successfully.
+        * So, start monitoring again.
+        */
+       monitor_thermal_zone(tz);
+}
+
+/**
+ * thermal_zone_get_temp() - returns the temperature of a thermal zone
+ * @tz: a valid pointer to a struct thermal_zone_device
+ * @temp: a valid pointer to where to store the resulting temperature.
+ *
+ * When a valid thermal zone reference is passed, it will fetch its
+ * temperature and fill @temp.
+ *
+ * Return: On success returns 0, an error code otherwise
+ */
+int thermal_zone_get_temp(struct thermal_zone_device *tz, unsigned long *temp)
+{
+       int ret = -EINVAL;
+#ifdef CONFIG_THERMAL_EMULATION
+       int count;
+       unsigned long crit_temp = -1UL;
+       enum thermal_trip_type type;
+#endif
+
+       if (!tz || IS_ERR(tz))
+               goto exit;
+
+       mutex_lock(&tz->lock);
+
+       ret = tz->ops->get_temp(tz, temp);
+#ifdef CONFIG_THERMAL_EMULATION
+       if (!tz->emul_temperature)
+               goto skip_emul;
+
+       for (count = 0; count < tz->trips; count++) {
+               ret = tz->ops->get_trip_type(tz, count, &type);
+               if (!ret && type == THERMAL_TRIP_CRITICAL) {
+                       ret = tz->ops->get_trip_temp(tz, count, &crit_temp);
+                       break;
+               }
+       }
+
+       if (ret)
+               goto skip_emul;
+
+       if (*temp < crit_temp)
+               *temp = tz->emul_temperature;
+skip_emul:
+#endif
+       mutex_unlock(&tz->lock);
+exit:
+       return ret;
+}
+EXPORT_SYMBOL_GPL(thermal_zone_get_temp);
+
+static void update_temperature(struct thermal_zone_device *tz)
+{
+       long temp;
+       int ret;
+
+       ret = thermal_zone_get_temp(tz, &temp);
+       if (ret) {
+               dev_warn(&tz->device, "failed to read out thermal zone %d\n",
+                        tz->id);
+               return;
+       }
+
+       mutex_lock(&tz->lock);
+       tz->last_temperature = tz->temperature;
+       tz->temperature = temp;
+       mutex_unlock(&tz->lock);
+}
+
+void thermal_zone_device_update(struct thermal_zone_device *tz)
+{
+       int count;
+
+       update_temperature(tz);
+
+       for (count = 0; count < tz->trips; count++)
+               handle_thermal_trip(tz, count);
+}
+EXPORT_SYMBOL_GPL(thermal_zone_device_update);
+
+static void thermal_zone_device_check(struct work_struct *work)
+{
+       struct thermal_zone_device *tz = container_of(work, struct
+                                                     thermal_zone_device,
+                                                     poll_queue.work);
+       thermal_zone_device_update(tz);
+}
+
+/* sys I/F for thermal zone */
+
+#define to_thermal_zone(_dev) \
+       container_of(_dev, struct thermal_zone_device, device)
+
+static ssize_t
+type_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+       struct thermal_zone_device *tz = to_thermal_zone(dev);
+
+       return sprintf(buf, "%s\n", tz->type);
+}
+
+static ssize_t
+temp_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+       struct thermal_zone_device *tz = to_thermal_zone(dev);
+       long temperature;
+       int ret;
+
+       ret = thermal_zone_get_temp(tz, &temperature);
+
+       if (ret)
+               return ret;
+
+       return sprintf(buf, "%ld\n", temperature);
+}
+
+static ssize_t
+mode_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+       struct thermal_zone_device *tz = to_thermal_zone(dev);
+       enum thermal_device_mode mode;
+       int result;
+
+       if (!tz->ops->get_mode)
+               return -EPERM;
+
+       result = tz->ops->get_mode(tz, &mode);
+       if (result)
+               return result;
+
+       return sprintf(buf, "%s\n", mode == THERMAL_DEVICE_ENABLED ? "enabled"
+                      : "disabled");
+}
+
+static ssize_t
+mode_store(struct device *dev, struct device_attribute *attr,
+          const char *buf, size_t count)
+{
+       struct thermal_zone_device *tz = to_thermal_zone(dev);
+       int result;
+
+       if (!tz->ops->set_mode)
+               return -EPERM;
+
+       if (!strncmp(buf, "enabled", sizeof("enabled") - 1))
+               result = tz->ops->set_mode(tz, THERMAL_DEVICE_ENABLED);
+       else if (!strncmp(buf, "disabled", sizeof("disabled") - 1))
+               result = tz->ops->set_mode(tz, THERMAL_DEVICE_DISABLED);
+       else
+               result = -EINVAL;
+
+       if (result)
+               return result;
+
+       return count;
+}
+
+static ssize_t
+trip_point_type_show(struct device *dev, struct device_attribute *attr,
+                    char *buf)
+{
+       struct thermal_zone_device *tz = to_thermal_zone(dev);
+       enum thermal_trip_type type;
+       int trip, result;
+
+       if (!tz->ops->get_trip_type)
+               return -EPERM;
+
+       if (!sscanf(attr->attr.name, "trip_point_%d_type", &trip))
+               return -EINVAL;
+
+       result = tz->ops->get_trip_type(tz, trip, &type);
+       if (result)
+               return result;
+
+       switch (type) {
+       case THERMAL_TRIP_CRITICAL:
+               return sprintf(buf, "critical\n");
+       case THERMAL_TRIP_HOT:
+               return sprintf(buf, "hot\n");
+       case THERMAL_TRIP_PASSIVE:
+               return sprintf(buf, "passive\n");
+       case THERMAL_TRIP_ACTIVE:
+               return sprintf(buf, "active\n");
+       default:
+               return sprintf(buf, "unknown\n");
+       }
+}
+
+static ssize_t
+trip_point_temp_store(struct device *dev, struct device_attribute *attr,
+                    const char *buf, size_t count)
+{
+       struct thermal_zone_device *tz = to_thermal_zone(dev);
+       int trip, ret;
+       unsigned long temperature;
+
+       if (!tz->ops->set_trip_temp)
+               return -EPERM;
+
+       if (!sscanf(attr->attr.name, "trip_point_%d_temp", &trip))
+               return -EINVAL;
+
+       if (kstrtoul(buf, 10, &temperature))
+               return -EINVAL;
+
+       ret = tz->ops->set_trip_temp(tz, trip, temperature);
+
+       return ret ? ret : count;
+}
+
+static ssize_t
+trip_point_temp_show(struct device *dev, struct device_attribute *attr,
+                    char *buf)
+{
+       struct thermal_zone_device *tz = to_thermal_zone(dev);
+       int trip, ret;
+       long temperature;
+
+       if (!tz->ops->get_trip_temp)
+               return -EPERM;
+
+       if (!sscanf(attr->attr.name, "trip_point_%d_temp", &trip))
+               return -EINVAL;
+
+       ret = tz->ops->get_trip_temp(tz, trip, &temperature);
+
+       if (ret)
+               return ret;
+
+       return sprintf(buf, "%ld\n", temperature);
+}
+
+static ssize_t
+trip_point_hyst_store(struct device *dev, struct device_attribute *attr,
+                       const char *buf, size_t count)
+{
+       struct thermal_zone_device *tz = to_thermal_zone(dev);
+       int trip, ret;
+       unsigned long temperature;
+
+       if (!tz->ops->set_trip_hyst)
+               return -EPERM;
+
+       if (!sscanf(attr->attr.name, "trip_point_%d_hyst", &trip))
+               return -EINVAL;
+
+       if (kstrtoul(buf, 10, &temperature))
+               return -EINVAL;
+
+       /*
+        * We are not doing any check on the 'temperature' value
+        * here. The driver implementing 'set_trip_hyst' has to
+        * take care of this.
+        */
+       ret = tz->ops->set_trip_hyst(tz, trip, temperature);
+
+       return ret ? ret : count;
+}
+
+static ssize_t
+trip_point_hyst_show(struct device *dev, struct device_attribute *attr,
+                       char *buf)
+{
+       struct thermal_zone_device *tz = to_thermal_zone(dev);
+       int trip, ret;
+       unsigned long temperature;
+
+       if (!tz->ops->get_trip_hyst)
+               return -EPERM;
+
+       if (!sscanf(attr->attr.name, "trip_point_%d_hyst", &trip))
+               return -EINVAL;
+
+       ret = tz->ops->get_trip_hyst(tz, trip, &temperature);
+
+       return ret ? ret : sprintf(buf, "%ld\n", temperature);
+}
+
+static ssize_t
+passive_store(struct device *dev, struct device_attribute *attr,
+                   const char *buf, size_t count)
+{
+       struct thermal_zone_device *tz = to_thermal_zone(dev);
+       struct thermal_cooling_device *cdev = NULL;
+       int state;
+
+       if (!sscanf(buf, "%d\n", &state))
+               return -EINVAL;
+
+       /* sanity check: values below 1000 millicelsius don't make sense
+        * and can cause the system to go into a thermal heart attack
+        */
+       if (state && state < 1000)
+               return -EINVAL;
+
+       if (state && !tz->forced_passive) {
+               mutex_lock(&thermal_list_lock);
+               list_for_each_entry(cdev, &thermal_cdev_list, node) {
+                       if (!strncmp("Processor", cdev->type,
+                                    sizeof("Processor")))
+                               thermal_zone_bind_cooling_device(tz,
+                                               THERMAL_TRIPS_NONE, cdev,
+                                               THERMAL_NO_LIMIT,
+                                               THERMAL_NO_LIMIT);
+               }
+               mutex_unlock(&thermal_list_lock);
+               if (!tz->passive_delay)
+                       tz->passive_delay = 1000;
+       } else if (!state && tz->forced_passive) {
+               mutex_lock(&thermal_list_lock);
+               list_for_each_entry(cdev, &thermal_cdev_list, node) {
+                       if (!strncmp("Processor", cdev->type,
+                                    sizeof("Processor")))
+                               thermal_zone_unbind_cooling_device(tz,
+                                                                  THERMAL_TRIPS_NONE,
+                                                                  cdev);
+               }
+               mutex_unlock(&thermal_list_lock);
+               tz->passive_delay = 0;
+       }
+
+       tz->forced_passive = state;
+
+       thermal_zone_device_update(tz);
+
+       return count;
+}
+
+static ssize_t
+passive_show(struct device *dev, struct device_attribute *attr,
+                  char *buf)
+{
+       struct thermal_zone_device *tz = to_thermal_zone(dev);
+
+       return sprintf(buf, "%d\n", tz->forced_passive);
+}
+
+static ssize_t
+policy_store(struct device *dev, struct device_attribute *attr,
+                   const char *buf, size_t count)
+{
+       int ret = -EINVAL;
+       struct thermal_zone_device *tz = to_thermal_zone(dev);
+       struct thermal_governor *gov;
+
+       mutex_lock(&thermal_governor_lock);
+
+       gov = __find_governor(buf);
+       if (!gov)
+               goto exit;
+
+       tz->governor = gov;
+       ret = count;
+
+exit:
+       mutex_unlock(&thermal_governor_lock);
+       return ret;
+}
+
+static ssize_t
+policy_show(struct device *dev, struct device_attribute *devattr, char *buf)
+{
+       struct thermal_zone_device *tz = to_thermal_zone(dev);
+
+       return sprintf(buf, "%s\n", tz->governor->name);
+}
+
+#ifdef CONFIG_THERMAL_EMULATION
+static ssize_t
+emul_temp_store(struct device *dev, struct device_attribute *attr,
+                    const char *buf, size_t count)
+{
+       struct thermal_zone_device *tz = to_thermal_zone(dev);
+       int ret = 0;
+       unsigned long temperature;
+
+       if (kstrtoul(buf, 10, &temperature))
+               return -EINVAL;
+
+       if (!tz->ops->set_emul_temp) {
+               mutex_lock(&tz->lock);
+               tz->emul_temperature = temperature;
+               mutex_unlock(&tz->lock);
+       } else {
+               ret = tz->ops->set_emul_temp(tz, temperature);
+       }
+
+       return ret ? ret : count;
+}
+static DEVICE_ATTR(emul_temp, S_IWUSR, NULL, emul_temp_store);
+#endif /* CONFIG_THERMAL_EMULATION */
+
+static DEVICE_ATTR(type, 0444, type_show, NULL);
+static DEVICE_ATTR(temp, 0444, temp_show, NULL);
+static DEVICE_ATTR(mode, 0644, mode_show, mode_store);
+static DEVICE_ATTR(passive, S_IRUGO | S_IWUSR, passive_show, passive_store);
+static DEVICE_ATTR(policy, S_IRUGO | S_IWUSR, policy_show, policy_store);
+
+/* sys I/F for cooling device */
+#define to_cooling_device(_dev)        \
+       container_of(_dev, struct thermal_cooling_device, device)
+
+static ssize_t
+thermal_cooling_device_type_show(struct device *dev,
+                                struct device_attribute *attr, char *buf)
+{
+       struct thermal_cooling_device *cdev = to_cooling_device(dev);
+
+       return sprintf(buf, "%s\n", cdev->type);
+}
+
+static ssize_t
+thermal_cooling_device_max_state_show(struct device *dev,
+                                     struct device_attribute *attr, char *buf)
+{
+       struct thermal_cooling_device *cdev = to_cooling_device(dev);
+       unsigned long state;
+       int ret;
+
+       ret = cdev->ops->get_max_state(cdev, &state);
+       if (ret)
+               return ret;
+       return sprintf(buf, "%ld\n", state);
+}
+
+static ssize_t
+thermal_cooling_device_cur_state_show(struct device *dev,
+                                     struct device_attribute *attr, char *buf)
+{
+       struct thermal_cooling_device *cdev = to_cooling_device(dev);
+       unsigned long state;
+       int ret;
+
+       ret = cdev->ops->get_cur_state(cdev, &state);
+       if (ret)
+               return ret;
+       return sprintf(buf, "%ld\n", state);
+}
+
+static ssize_t
+thermal_cooling_device_cur_state_store(struct device *dev,
+                                      struct device_attribute *attr,
+                                      const char *buf, size_t count)
+{
+       struct thermal_cooling_device *cdev = to_cooling_device(dev);
+       unsigned long state;
+       int result;
+
+       if (!sscanf(buf, "%ld\n", &state))
+               return -EINVAL;
+
+       if ((long)state < 0)
+               return -EINVAL;
+
+       result = cdev->ops->set_cur_state(cdev, state);
+       if (result)
+               return result;
+       return count;
+}
+
+static struct device_attribute dev_attr_cdev_type =
+__ATTR(type, 0444, thermal_cooling_device_type_show, NULL);
+static DEVICE_ATTR(max_state, 0444,
+                  thermal_cooling_device_max_state_show, NULL);
+static DEVICE_ATTR(cur_state, 0644,
+                  thermal_cooling_device_cur_state_show,
+                  thermal_cooling_device_cur_state_store);
+
+static ssize_t
+thermal_cooling_device_trip_point_show(struct device *dev,
+                                      struct device_attribute *attr, char *buf)
+{
+       struct thermal_instance *instance;
+
+       instance =
+           container_of(attr, struct thermal_instance, attr);
+
+       if (instance->trip == THERMAL_TRIPS_NONE)
+               return sprintf(buf, "-1\n");
+       else
+               return sprintf(buf, "%d\n", instance->trip);
+}
+
+/* Device management */
+
+#if defined(CONFIG_THERMAL_HWMON)
+
+/* hwmon sys I/F */
+#include <linux/hwmon.h>
+
+/* thermal zone devices with the same type share one hwmon device */
+struct thermal_hwmon_device {
+       char type[THERMAL_NAME_LENGTH];
+       struct device *device;
+       int count;
+       struct list_head tz_list;
+       struct list_head node;
+};
+
+struct thermal_hwmon_attr {
+       struct device_attribute attr;
+       char name[16];
+};
+
+/* one temperature input for each thermal zone */
+struct thermal_hwmon_temp {
+       struct list_head hwmon_node;
+       struct thermal_zone_device *tz;
+       struct thermal_hwmon_attr temp_input;   /* hwmon sys attr */
+       struct thermal_hwmon_attr temp_crit;    /* hwmon sys attr */
+};
+
+static LIST_HEAD(thermal_hwmon_list);
+
+static ssize_t
+name_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+       struct thermal_hwmon_device *hwmon = dev_get_drvdata(dev);
+       return sprintf(buf, "%s\n", hwmon->type);
+}
+static DEVICE_ATTR(name, 0444, name_show, NULL);
+
+static ssize_t
+temp_input_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+       long temperature;
+       int ret;
+       struct thermal_hwmon_attr *hwmon_attr
+                       = container_of(attr, struct thermal_hwmon_attr, attr);
+       struct thermal_hwmon_temp *temp
+                       = container_of(hwmon_attr, struct thermal_hwmon_temp,
+                                      temp_input);
+       struct thermal_zone_device *tz = temp->tz;
+
+       ret = thermal_zone_get_temp(tz, &temperature);
+
+       if (ret)
+               return ret;
+
+       return sprintf(buf, "%ld\n", temperature);
+}
+
+static ssize_t
+temp_crit_show(struct device *dev, struct device_attribute *attr,
+               char *buf)
+{
+       struct thermal_hwmon_attr *hwmon_attr
+                       = container_of(attr, struct thermal_hwmon_attr, attr);
+       struct thermal_hwmon_temp *temp
+                       = container_of(hwmon_attr, struct thermal_hwmon_temp,
+                                      temp_crit);
+       struct thermal_zone_device *tz = temp->tz;
+       long temperature;
+       int ret;
+
+       ret = tz->ops->get_trip_temp(tz, 0, &temperature);
+       if (ret)
+               return ret;
+
+       return sprintf(buf, "%ld\n", temperature);
+}
+
+
+static struct thermal_hwmon_device *
+thermal_hwmon_lookup_by_type(const struct thermal_zone_device *tz)
+{
+       struct thermal_hwmon_device *hwmon;
+
+       mutex_lock(&thermal_list_lock);
+       list_for_each_entry(hwmon, &thermal_hwmon_list, node)
+               if (!strcmp(hwmon->type, tz->type)) {
+                       mutex_unlock(&thermal_list_lock);
+                       return hwmon;
+               }
+       mutex_unlock(&thermal_list_lock);
+
+       return NULL;
+}
+
+/* Find the temperature input matching a given thermal zone */
+static struct thermal_hwmon_temp *
+thermal_hwmon_lookup_temp(const struct thermal_hwmon_device *hwmon,
+                         const struct thermal_zone_device *tz)
+{
+       struct thermal_hwmon_temp *temp;
+
+       mutex_lock(&thermal_list_lock);
+       list_for_each_entry(temp, &hwmon->tz_list, hwmon_node)
+               if (temp->tz == tz) {
+                       mutex_unlock(&thermal_list_lock);
+                       return temp;
+               }
+       mutex_unlock(&thermal_list_lock);
+
+       return NULL;
+}
+
+static int
+thermal_add_hwmon_sysfs(struct thermal_zone_device *tz)
+{
+       struct thermal_hwmon_device *hwmon;
+       struct thermal_hwmon_temp *temp;
+       int new_hwmon_device = 1;
+       int result;
+
+       hwmon = thermal_hwmon_lookup_by_type(tz);
+       if (hwmon) {
+               new_hwmon_device = 0;
+               goto register_sys_interface;
+       }
+
+       hwmon = kzalloc(sizeof(struct thermal_hwmon_device), GFP_KERNEL);
+       if (!hwmon)
+               return -ENOMEM;
+
+       INIT_LIST_HEAD(&hwmon->tz_list);
+       strlcpy(hwmon->type, tz->type, THERMAL_NAME_LENGTH);
+       hwmon->device = hwmon_device_register(NULL);
+       if (IS_ERR(hwmon->device)) {
+               result = PTR_ERR(hwmon->device);
+               goto free_mem;
+       }
+       dev_set_drvdata(hwmon->device, hwmon);
+       result = device_create_file(hwmon->device, &dev_attr_name);
+       if (result)
+               goto free_mem;
+
+ register_sys_interface:
+       temp = kzalloc(sizeof(struct thermal_hwmon_temp), GFP_KERNEL);
+       if (!temp) {
+               result = -ENOMEM;
+               goto unregister_name;
+       }
+
+       temp->tz = tz;
+       hwmon->count++;
+
+       snprintf(temp->temp_input.name, sizeof(temp->temp_input.name),
+                "temp%d_input", hwmon->count);
+       temp->temp_input.attr.attr.name = temp->temp_input.name;
+       temp->temp_input.attr.attr.mode = 0444;
+       temp->temp_input.attr.show = temp_input_show;
+       sysfs_attr_init(&temp->temp_input.attr.attr);
+       result = device_create_file(hwmon->device, &temp->temp_input.attr);
+       if (result)
+               goto free_temp_mem;
+
+       if (tz->ops->get_crit_temp) {
+               unsigned long temperature;
+               if (!tz->ops->get_crit_temp(tz, &temperature)) {
+                       snprintf(temp->temp_crit.name,
+                                sizeof(temp->temp_crit.name),
+                               "temp%d_crit", hwmon->count);
+                       temp->temp_crit.attr.attr.name = temp->temp_crit.name;
+                       temp->temp_crit.attr.attr.mode = 0444;
+                       temp->temp_crit.attr.show = temp_crit_show;
+                       sysfs_attr_init(&temp->temp_crit.attr.attr);
+                       result = device_create_file(hwmon->device,
+                                                   &temp->temp_crit.attr);
+                       if (result)
+                               goto unregister_input;
+               }
+       }
+
+       mutex_lock(&thermal_list_lock);
+       if (new_hwmon_device)
+               list_add_tail(&hwmon->node, &thermal_hwmon_list);
+       list_add_tail(&temp->hwmon_node, &hwmon->tz_list);
+       mutex_unlock(&thermal_list_lock);
+
+       return 0;
+
+ unregister_input:
+       device_remove_file(hwmon->device, &temp->temp_input.attr);
+ free_temp_mem:
+       kfree(temp);
+ unregister_name:
+       if (new_hwmon_device) {
+               device_remove_file(hwmon->device, &dev_attr_name);
+               hwmon_device_unregister(hwmon->device);
+       }
+ free_mem:
+       if (new_hwmon_device)
+               kfree(hwmon);
+
+       return result;
+}
+
+static void
+thermal_remove_hwmon_sysfs(struct thermal_zone_device *tz)
+{
+       struct thermal_hwmon_device *hwmon;
+       struct thermal_hwmon_temp *temp;
+
+       hwmon = thermal_hwmon_lookup_by_type(tz);
+       if (unlikely(!hwmon)) {
+               /* Should never happen... */
+               dev_dbg(&tz->device, "hwmon device lookup failed!\n");
+               return;
+       }
+
+       temp = thermal_hwmon_lookup_temp(hwmon, tz);
+       if (unlikely(!temp)) {
+               /* Should never happen... */
+               dev_dbg(&tz->device, "temperature input lookup failed!\n");
+               return;
+       }
+
+       device_remove_file(hwmon->device, &temp->temp_input.attr);
+       if (tz->ops->get_crit_temp)
+               device_remove_file(hwmon->device, &temp->temp_crit.attr);
+
+       mutex_lock(&thermal_list_lock);
+       list_del(&temp->hwmon_node);
+       kfree(temp);
+       if (!list_empty(&hwmon->tz_list)) {
+               mutex_unlock(&thermal_list_lock);
+               return;
+       }
+       list_del(&hwmon->node);
+       mutex_unlock(&thermal_list_lock);
+
+       device_remove_file(hwmon->device, &dev_attr_name);
+       hwmon_device_unregister(hwmon->device);
+       kfree(hwmon);
+}
+#else
+static int
+thermal_add_hwmon_sysfs(struct thermal_zone_device *tz)
+{
+       return 0;
+}
+
+static void
+thermal_remove_hwmon_sysfs(struct thermal_zone_device *tz)
+{
+}
+#endif
+
+/**
+ * thermal_zone_bind_cooling_device() - bind a cooling device to a thermal zone
+ * @tz:                pointer to struct thermal_zone_device
+ * @trip:      indicates which trip point the cooling device is
+ *             associated with in this thermal zone.
+ * @cdev:      pointer to struct thermal_cooling_device
+ * @upper:     the maximum cooling state for this trip point.
+ *             THERMAL_NO_LIMIT means no upper limit,
+ *             and the cooling device can be in max_state.
+ * @lower:     the minimum cooling state that can be used for this trip point.
+ *             THERMAL_NO_LIMIT means no lower limit,
+ *             and the cooling device can be in cooling state 0.
+ *
+ * This interface function binds a thermal cooling device to a given trip
+ * point of a thermal zone device.
+ * This function is usually called in the thermal zone device .bind callback.
+ *
+ * Return: 0 on success, the proper error value otherwise.
+ */
+int thermal_zone_bind_cooling_device(struct thermal_zone_device *tz,
+                                    int trip,
+                                    struct thermal_cooling_device *cdev,
+                                    unsigned long upper, unsigned long lower)
+{
+       struct thermal_instance *dev;
+       struct thermal_instance *pos;
+       struct thermal_zone_device *pos1;
+       struct thermal_cooling_device *pos2;
+       unsigned long max_state;
+       int result;
+
+       if (trip >= tz->trips || (trip < 0 && trip != THERMAL_TRIPS_NONE))
+               return -EINVAL;
+
+       list_for_each_entry(pos1, &thermal_tz_list, node) {
+               if (pos1 == tz)
+                       break;
+       }
+       list_for_each_entry(pos2, &thermal_cdev_list, node) {
+               if (pos2 == cdev)
+                       break;
+       }
+
+       if (tz != pos1 || cdev != pos2)
+               return -EINVAL;
+
+       cdev->ops->get_max_state(cdev, &max_state);
+
+       /* lower default 0, upper default max_state */
+       lower = lower == THERMAL_NO_LIMIT ? 0 : lower;
+       upper = upper == THERMAL_NO_LIMIT ? max_state : upper;
+
+       if (lower > upper || upper > max_state)
+               return -EINVAL;
+
+       dev =
+           kzalloc(sizeof(struct thermal_instance), GFP_KERNEL);
+       if (!dev)
+               return -ENOMEM;
+       dev->tz = tz;
+       dev->cdev = cdev;
+       dev->trip = trip;
+       dev->upper = upper;
+       dev->lower = lower;
+       dev->target = THERMAL_NO_TARGET;
+
+       result = get_idr(&tz->idr, &tz->lock, &dev->id);
+       if (result)
+               goto free_mem;
+
+       sprintf(dev->name, "cdev%d", dev->id);
+       result =
+           sysfs_create_link(&tz->device.kobj, &cdev->device.kobj, dev->name);
+       if (result)
+               goto release_idr;
+
+       sprintf(dev->attr_name, "cdev%d_trip_point", dev->id);
+       sysfs_attr_init(&dev->attr.attr);
+       dev->attr.attr.name = dev->attr_name;
+       dev->attr.attr.mode = 0444;
+       dev->attr.show = thermal_cooling_device_trip_point_show;
+       result = device_create_file(&tz->device, &dev->attr);
+       if (result)
+               goto remove_symbol_link;
+
+       mutex_lock(&tz->lock);
+       mutex_lock(&cdev->lock);
+       list_for_each_entry(pos, &tz->thermal_instances, tz_node)
+           if (pos->tz == tz && pos->trip == trip && pos->cdev == cdev) {
+               result = -EEXIST;
+               break;
+       }
+       if (!result) {
+               list_add_tail(&dev->tz_node, &tz->thermal_instances);
+               list_add_tail(&dev->cdev_node, &cdev->thermal_instances);
+       }
+       mutex_unlock(&cdev->lock);
+       mutex_unlock(&tz->lock);
+
+       if (!result)
+               return 0;
+
+       device_remove_file(&tz->device, &dev->attr);
+remove_symbol_link:
+       sysfs_remove_link(&tz->device.kobj, dev->name);
+release_idr:
+       release_idr(&tz->idr, &tz->lock, dev->id);
+free_mem:
+       kfree(dev);
+       return result;
+}
+EXPORT_SYMBOL_GPL(thermal_zone_bind_cooling_device);
+
+/**
+ * thermal_zone_unbind_cooling_device() - unbind a cooling device from a
+ *                                       thermal zone.
+ * @tz:                pointer to a struct thermal_zone_device.
+ * @trip:      indicates which trip point the cooling device is
+ *             associated with in this thermal zone.
+ * @cdev:      pointer to a struct thermal_cooling_device.
+ *
+ * This interface function unbind a thermal cooling device from the certain
+ * trip point of a thermal zone device.
+ * This function is usually called in the thermal zone device .unbind callback.
+ *
+ * Return: 0 on success, the proper error value otherwise.
+ */
+int thermal_zone_unbind_cooling_device(struct thermal_zone_device *tz,
+                                      int trip,
+                                      struct thermal_cooling_device *cdev)
+{
+       struct thermal_instance *pos, *next;
+
+       mutex_lock(&tz->lock);
+       mutex_lock(&cdev->lock);
+       list_for_each_entry_safe(pos, next, &tz->thermal_instances, tz_node) {
+               if (pos->tz == tz && pos->trip == trip && pos->cdev == cdev) {
+                       list_del(&pos->tz_node);
+                       list_del(&pos->cdev_node);
+                       mutex_unlock(&cdev->lock);
+                       mutex_unlock(&tz->lock);
+                       goto unbind;
+               }
+       }
+       mutex_unlock(&cdev->lock);
+       mutex_unlock(&tz->lock);
+
+       return -ENODEV;
+
+unbind:
+       device_remove_file(&tz->device, &pos->attr);
+       sysfs_remove_link(&tz->device.kobj, pos->name);
+       release_idr(&tz->idr, &tz->lock, pos->id);
+       kfree(pos);
+       return 0;
+}
+EXPORT_SYMBOL_GPL(thermal_zone_unbind_cooling_device);
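
As a point of reference, here is a minimal sketch of how a platform thermal driver might wire these two helpers into its zone .bind/.unbind callbacks. The example_tz_* names and the choice of trip point 0 are illustrative assumptions, not part of this patch:

static int example_tz_bind(struct thermal_zone_device *tz,
			   struct thermal_cooling_device *cdev)
{
	/* attach cdev to trip 0; the core may use the full 0..max_state range */
	return thermal_zone_bind_cooling_device(tz, 0, cdev,
						THERMAL_NO_LIMIT,
						THERMAL_NO_LIMIT);
}

static int example_tz_unbind(struct thermal_zone_device *tz,
			     struct thermal_cooling_device *cdev)
{
	return thermal_zone_unbind_cooling_device(tz, 0, cdev);
}
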
+
+static void thermal_release(struct device *dev)
+{
+       struct thermal_zone_device *tz;
+       struct thermal_cooling_device *cdev;
+
+       if (!strncmp(dev_name(dev), "thermal_zone",
+                    sizeof("thermal_zone") - 1)) {
+               tz = to_thermal_zone(dev);
+               kfree(tz);
+       } else {
+               cdev = to_cooling_device(dev);
+               kfree(cdev);
+       }
+}
+
+static struct class thermal_class = {
+       .name = "thermal",
+       .dev_release = thermal_release,
+};
+
+/**
+ * thermal_cooling_device_register() - register a new thermal cooling device
+ * @type:      the thermal cooling device type.
+ * @devdata:   device private data.
+ * @ops:               standard thermal cooling device callbacks.
+ *
+ * This interface function adds a new thermal cooling device (fan/processor/...)
+ * to the /sys/class/thermal/ folder as cooling_device[0-*]. It tries to bind itself
+ * to all the thermal zone devices registered at the same time.
+ *
+ * Return: a pointer to the created struct thermal_cooling_device or an
+ * ERR_PTR. Caller must check return value with IS_ERR*() helpers.
+ */
+struct thermal_cooling_device *
+thermal_cooling_device_register(char *type, void *devdata,
+                               const struct thermal_cooling_device_ops *ops)
+{
+       struct thermal_cooling_device *cdev;
+       int result;
+
+       if (type && strlen(type) >= THERMAL_NAME_LENGTH)
+               return ERR_PTR(-EINVAL);
+
+       if (!ops || !ops->get_max_state || !ops->get_cur_state ||
+           !ops->set_cur_state)
+               return ERR_PTR(-EINVAL);
+
+       cdev = kzalloc(sizeof(struct thermal_cooling_device), GFP_KERNEL);
+       if (!cdev)
+               return ERR_PTR(-ENOMEM);
+
+       result = get_idr(&thermal_cdev_idr, &thermal_idr_lock, &cdev->id);
+       if (result) {
+               kfree(cdev);
+               return ERR_PTR(result);
+       }
+
+       strlcpy(cdev->type, type ? : "", sizeof(cdev->type));
+       mutex_init(&cdev->lock);
+       INIT_LIST_HEAD(&cdev->thermal_instances);
+       cdev->ops = ops;
+       cdev->updated = true;
+       cdev->device.class = &thermal_class;
+       cdev->devdata = devdata;
+       dev_set_name(&cdev->device, "cooling_device%d", cdev->id);
+       result = device_register(&cdev->device);
+       if (result) {
+               release_idr(&thermal_cdev_idr, &thermal_idr_lock, cdev->id);
+               kfree(cdev);
+               return ERR_PTR(result);
+       }
+
+       /* sys I/F */
+       if (type) {
+               result = device_create_file(&cdev->device, &dev_attr_cdev_type);
+               if (result)
+                       goto unregister;
+       }
+
+       result = device_create_file(&cdev->device, &dev_attr_max_state);
+       if (result)
+               goto unregister;
+
+       result = device_create_file(&cdev->device, &dev_attr_cur_state);
+       if (result)
+               goto unregister;
+
+       /* Add 'this' new cdev to the global cdev list */
+       mutex_lock(&thermal_list_lock);
+       list_add(&cdev->node, &thermal_cdev_list);
+       mutex_unlock(&thermal_list_lock);
+
+       /* Update binding information for 'this' new cdev */
+       bind_cdev(cdev);
+
+       return cdev;
+
+unregister:
+       release_idr(&thermal_cdev_idr, &thermal_idr_lock, cdev->id);
+       device_unregister(&cdev->device);
+       return ERR_PTR(result);
+}
+EXPORT_SYMBOL_GPL(thermal_cooling_device_register);
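
A minimal sketch of a fan driver registering itself through this interface; struct example_fan and the example_fan_* callbacks are hypothetical and only illustrate the mandatory get_max_state/get_cur_state/set_cur_state ops:

struct example_fan {			/* hypothetical driver state */
	unsigned long speed;		/* current state, 0..3 */
};

static int example_fan_get_max_state(struct thermal_cooling_device *cdev,
				     unsigned long *state)
{
	*state = 3;			/* assume four fan speeds */
	return 0;
}

static int example_fan_get_cur_state(struct thermal_cooling_device *cdev,
				     unsigned long *state)
{
	struct example_fan *fan = cdev->devdata;

	*state = fan->speed;
	return 0;
}

static int example_fan_set_cur_state(struct thermal_cooling_device *cdev,
				     unsigned long state)
{
	struct example_fan *fan = cdev->devdata;

	fan->speed = state;		/* program the hardware here */
	return 0;
}

static const struct thermal_cooling_device_ops example_fan_cooling_ops = {
	.get_max_state	= example_fan_get_max_state,
	.get_cur_state	= example_fan_get_cur_state,
	.set_cur_state	= example_fan_set_cur_state,
};

/* in the fan driver's probe path, 'fan' being its struct example_fan */
cdev = thermal_cooling_device_register("example-fan", fan,
				       &example_fan_cooling_ops);
if (IS_ERR(cdev))
	return PTR_ERR(cdev);
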
+
+/**
+ * thermal_cooling_device_unregister() - removes the registered thermal cooling device
+ * @cdev:      the thermal cooling device to remove.
+ *
+ * thermal_cooling_device_unregister() must be called when the device is no
+ * longer needed.
+ */
+void thermal_cooling_device_unregister(struct thermal_cooling_device *cdev)
+{
+       int i;
+       const struct thermal_zone_params *tzp;
+       struct thermal_zone_device *tz;
+       struct thermal_cooling_device *pos = NULL;
+
+       if (!cdev)
+               return;
+
+       mutex_lock(&thermal_list_lock);
+       list_for_each_entry(pos, &thermal_cdev_list, node)
+           if (pos == cdev)
+               break;
+       if (pos != cdev) {
+               /* thermal cooling device not found */
+               mutex_unlock(&thermal_list_lock);
+               return;
+       }
+       list_del(&cdev->node);
+
+       /* Unbind all thermal zones associated with 'this' cdev */
+       list_for_each_entry(tz, &thermal_tz_list, node) {
+               if (tz->ops->unbind) {
+                       tz->ops->unbind(tz, cdev);
+                       continue;
+               }
+
+               if (!tz->tzp || !tz->tzp->tbp)
+                       continue;
+
+               tzp = tz->tzp;
+               for (i = 0; i < tzp->num_tbps; i++) {
+                       if (tzp->tbp[i].cdev == cdev) {
+                               __unbind(tz, tzp->tbp[i].trip_mask, cdev);
+                               tzp->tbp[i].cdev = NULL;
+                       }
+               }
+       }
+
+       mutex_unlock(&thermal_list_lock);
+
+       if (cdev->type[0])
+               device_remove_file(&cdev->device, &dev_attr_cdev_type);
+       device_remove_file(&cdev->device, &dev_attr_max_state);
+       device_remove_file(&cdev->device, &dev_attr_cur_state);
+
+       release_idr(&thermal_cdev_idr, &thermal_idr_lock, cdev->id);
+       device_unregister(&cdev->device);
+       return;
+}
+EXPORT_SYMBOL_GPL(thermal_cooling_device_unregister);
+
+void thermal_cdev_update(struct thermal_cooling_device *cdev)
+{
+       struct thermal_instance *instance;
+       unsigned long target = 0;
+
+       /* cooling device is already updated */
+       if (cdev->updated)
+               return;
+
+       mutex_lock(&cdev->lock);
+       /* Make sure cdev enters the deepest cooling state */
+       list_for_each_entry(instance, &cdev->thermal_instances, cdev_node) {
+               if (instance->target == THERMAL_NO_TARGET)
+                       continue;
+               if (instance->target > target)
+                       target = instance->target;
+       }
+       mutex_unlock(&cdev->lock);
+       cdev->ops->set_cur_state(cdev, target);
+       cdev->updated = true;
+}
+EXPORT_SYMBOL(thermal_cdev_update);
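
A short sketch of how a governor typically drives this helper (loosely modelled on the step_wise pattern; example_throttle_instance is illustrative): it records a new per-instance target, clears the updated flag and lets thermal_cdev_update() apply the deepest requested state:

static void example_throttle_instance(struct thermal_instance *instance,
				      unsigned long new_target)
{
	instance->target = new_target;
	instance->cdev->updated = false;	/* force re-evaluation */
	thermal_cdev_update(instance->cdev);
}
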
+
+/**
+ * thermal_notify_framework() - Sensor drivers use this API to notify framework
+ * @tz:                thermal zone device
+ * @trip:      indicates which trip point has been crossed
+ *
+ * This function handles the trip events from sensor drivers. It starts
+ * throttling the cooling devices according to the policy configured.
+ * For CRITICAL and HOT trip points, this notifies the respective drivers,
+ * and does actual throttling for other trip points, i.e. ACTIVE and PASSIVE.
+ * The throttling policy is based on the configured platform data; if no
+ * platform data is provided, this uses the step_wise throttling policy.
+ */
+void thermal_notify_framework(struct thermal_zone_device *tz, int trip)
+{
+       handle_thermal_trip(tz, trip);
+}
+EXPORT_SYMBOL_GPL(thermal_notify_framework);
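
As an illustration, a hypothetical threaded interrupt handler in a sensor driver could forward a hardware trip notification to the core like this (the handler and the hard-coded trip index 0 are assumptions):

static irqreturn_t example_sensor_irq_thread(int irq, void *data)
{
	struct thermal_zone_device *tz = data;

	/* hardware reports that trip 0 was crossed; let the core apply policy */
	thermal_notify_framework(tz, 0);

	return IRQ_HANDLED;
}
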
+
+/**
+ * create_trip_attrs() - create attributes for trip points
+ * @tz:                the thermal zone device
+ * @mask:      Writeable trip point bitmap.
+ *
+ * Helper function that instantiates the sysfs entries for every trip
+ * point of a struct thermal_zone_device and their properties.
+ *
+ * Return: 0 on success, the proper error value otherwise.
+ */
+static int create_trip_attrs(struct thermal_zone_device *tz, int mask)
+{
+       int indx;
+       int size = sizeof(struct thermal_attr) * tz->trips;
+
+       tz->trip_type_attrs = kzalloc(size, GFP_KERNEL);
+       if (!tz->trip_type_attrs)
+               return -ENOMEM;
+
+       tz->trip_temp_attrs = kzalloc(size, GFP_KERNEL);
+       if (!tz->trip_temp_attrs) {
+               kfree(tz->trip_type_attrs);
+               return -ENOMEM;
+       }
+
+       if (tz->ops->get_trip_hyst) {
+               tz->trip_hyst_attrs = kzalloc(size, GFP_KERNEL);
+               if (!tz->trip_hyst_attrs) {
+                       kfree(tz->trip_type_attrs);
+                       kfree(tz->trip_temp_attrs);
+                       return -ENOMEM;
+               }
+       }
+
+       for (indx = 0; indx < tz->trips; indx++) {
+               /* create trip type attribute */
+               snprintf(tz->trip_type_attrs[indx].name, THERMAL_NAME_LENGTH,
+                        "trip_point_%d_type", indx);
+
+               sysfs_attr_init(&tz->trip_type_attrs[indx].attr.attr);
+               tz->trip_type_attrs[indx].attr.attr.name =
+                                               tz->trip_type_attrs[indx].name;
+               tz->trip_type_attrs[indx].attr.attr.mode = S_IRUGO;
+               tz->trip_type_attrs[indx].attr.show = trip_point_type_show;
+
+               device_create_file(&tz->device,
+                                  &tz->trip_type_attrs[indx].attr);
+
+               /* create trip temp attribute */
+               snprintf(tz->trip_temp_attrs[indx].name, THERMAL_NAME_LENGTH,
+                        "trip_point_%d_temp", indx);
+
+               sysfs_attr_init(&tz->trip_temp_attrs[indx].attr.attr);
+               tz->trip_temp_attrs[indx].attr.attr.name =
+                                               tz->trip_temp_attrs[indx].name;
+               tz->trip_temp_attrs[indx].attr.attr.mode = S_IRUGO;
+               tz->trip_temp_attrs[indx].attr.show = trip_point_temp_show;
+               if (mask & (1 << indx)) {
+                       tz->trip_temp_attrs[indx].attr.attr.mode |= S_IWUSR;
+                       tz->trip_temp_attrs[indx].attr.store =
+                                                       trip_point_temp_store;
+               }
+
+               device_create_file(&tz->device,
+                                  &tz->trip_temp_attrs[indx].attr);
+
+               /* create optional trip hyst attribute */
+               if (!tz->ops->get_trip_hyst)
+                       continue;
+               snprintf(tz->trip_hyst_attrs[indx].name, THERMAL_NAME_LENGTH,
+                        "trip_point_%d_hyst", indx);
+
+               sysfs_attr_init(&tz->trip_hyst_attrs[indx].attr.attr);
+               tz->trip_hyst_attrs[indx].attr.attr.name =
+                                       tz->trip_hyst_attrs[indx].name;
+               tz->trip_hyst_attrs[indx].attr.attr.mode = S_IRUGO;
+               tz->trip_hyst_attrs[indx].attr.show = trip_point_hyst_show;
+               if (tz->ops->set_trip_hyst) {
+                       tz->trip_hyst_attrs[indx].attr.attr.mode |= S_IWUSR;
+                       tz->trip_hyst_attrs[indx].attr.store =
+                                       trip_point_hyst_store;
+               }
+
+               device_create_file(&tz->device,
+                                  &tz->trip_hyst_attrs[indx].attr);
+       }
+       return 0;
+}
+
+static void remove_trip_attrs(struct thermal_zone_device *tz)
+{
+       int indx;
+
+       for (indx = 0; indx < tz->trips; indx++) {
+               device_remove_file(&tz->device,
+                                  &tz->trip_type_attrs[indx].attr);
+               device_remove_file(&tz->device,
+                                  &tz->trip_temp_attrs[indx].attr);
+               if (tz->ops->get_trip_hyst)
+                       device_remove_file(&tz->device,
+                                 &tz->trip_hyst_attrs[indx].attr);
+       }
+       kfree(tz->trip_type_attrs);
+       kfree(tz->trip_temp_attrs);
+       kfree(tz->trip_hyst_attrs);
+}
+
+/**
+ * thermal_zone_device_register() - register a new thermal zone device
+ * @type:      the thermal zone device type
+ * @trips:     the number of trip points the thermal zone supports
+ * @mask:      a bit string indicating the writeability of trip points
+ * @devdata:   private device data
+ * @ops:       standard thermal zone device callbacks
+ * @tzp:       thermal zone platform parameters
+ * @passive_delay: number of milliseconds to wait between polls when
+ *                performing passive cooling
+ * @polling_delay: number of milliseconds to wait between polls when checking
+ *                whether trip points have been crossed (0 for interrupt
+ *                driven systems)
+ *
+ * This interface function adds a new thermal zone device (sensor) to
+ * /sys/class/thermal folder as thermal_zone[0-*]. It tries to bind all the
+ * thermal cooling devices registered at the same time.
+ * thermal_zone_device_unregister() must be called when the device is no
+ * longer needed. The passive cooling depends on the .get_trend() return value.
+ *
+ * Return: a pointer to the created struct thermal_zone_device or,
+ * in case of error, an ERR_PTR. Caller must check the return value with
+ * IS_ERR*() helpers.
+ */
+struct thermal_zone_device *thermal_zone_device_register(const char *type,
+       int trips, int mask, void *devdata,
+       const struct thermal_zone_device_ops *ops,
+       const struct thermal_zone_params *tzp,
+       int passive_delay, int polling_delay)
+{
+       struct thermal_zone_device *tz;
+       enum thermal_trip_type trip_type;
+       int result;
+       int count;
+       int passive = 0;
+
+       if (type && strlen(type) >= THERMAL_NAME_LENGTH)
+               return ERR_PTR(-EINVAL);
+
+       if (trips > THERMAL_MAX_TRIPS || trips < 0 || mask >> trips)
+               return ERR_PTR(-EINVAL);
+
+       if (!ops || !ops->get_temp)
+               return ERR_PTR(-EINVAL);
+
+       if (trips > 0 && !ops->get_trip_type)
+               return ERR_PTR(-EINVAL);
+
+       tz = kzalloc(sizeof(struct thermal_zone_device), GFP_KERNEL);
+       if (!tz)
+               return ERR_PTR(-ENOMEM);
+
+       INIT_LIST_HEAD(&tz->thermal_instances);
+       idr_init(&tz->idr);
+       mutex_init(&tz->lock);
+       result = get_idr(&thermal_tz_idr, &thermal_idr_lock, &tz->id);
+       if (result) {
+               kfree(tz);
+               return ERR_PTR(result);
+       }
+
+       strlcpy(tz->type, type ? : "", sizeof(tz->type));
+       tz->ops = ops;
+       tz->tzp = tzp;
+       tz->device.class = &thermal_class;
+       tz->devdata = devdata;
+       tz->trips = trips;
+       tz->passive_delay = passive_delay;
+       tz->polling_delay = polling_delay;
+
+       dev_set_name(&tz->device, "thermal_zone%d", tz->id);
+       result = device_register(&tz->device);
+       if (result) {
+               release_idr(&thermal_tz_idr, &thermal_idr_lock, tz->id);
+               kfree(tz);
+               return ERR_PTR(result);
+       }
+
+       /* sys I/F */
+       if (type) {
+               result = device_create_file(&tz->device, &dev_attr_type);
+               if (result)
+                       goto unregister;
+       }
+
+       result = device_create_file(&tz->device, &dev_attr_temp);
+       if (result)
+               goto unregister;
+
+       if (ops->get_mode) {
+               result = device_create_file(&tz->device, &dev_attr_mode);
+               if (result)
+                       goto unregister;
+       }
+
+       result = create_trip_attrs(tz, mask);
+       if (result)
+               goto unregister;
+
+       for (count = 0; count < trips; count++) {
+               tz->ops->get_trip_type(tz, count, &trip_type);
+               if (trip_type == THERMAL_TRIP_PASSIVE)
+                       passive = 1;
+       }
+
+       if (!passive) {
+               result = device_create_file(&tz->device, &dev_attr_passive);
+               if (result)
+                       goto unregister;
+       }
+
+#ifdef CONFIG_THERMAL_EMULATION
+       result = device_create_file(&tz->device, &dev_attr_emul_temp);
+       if (result)
+               goto unregister;
+#endif
+       /* Create policy attribute */
+       result = device_create_file(&tz->device, &dev_attr_policy);
+       if (result)
+               goto unregister;
+
+       /* Update 'this' zone's governor information */
+       mutex_lock(&thermal_governor_lock);
+
+       if (tz->tzp)
+               tz->governor = __find_governor(tz->tzp->governor_name);
+       else
+               tz->governor = __find_governor(DEFAULT_THERMAL_GOVERNOR);
+
+       mutex_unlock(&thermal_governor_lock);
+
+       result = thermal_add_hwmon_sysfs(tz);
+       if (result)
+               goto unregister;
+
+       mutex_lock(&thermal_list_lock);
+       list_add_tail(&tz->node, &thermal_tz_list);
+       mutex_unlock(&thermal_list_lock);
+
+       /* Bind cooling devices for this zone */
+       bind_tz(tz);
+
+       INIT_DELAYED_WORK(&(tz->poll_queue), thermal_zone_device_check);
+
+       thermal_zone_device_update(tz);
+
+       if (!result)
+               return tz;
+
+unregister:
+       release_idr(&thermal_tz_idr, &thermal_idr_lock, tz->id);
+       device_unregister(&tz->device);
+       return ERR_PTR(result);
+}
+EXPORT_SYMBOL_GPL(thermal_zone_device_register);
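
A minimal sketch of a sensor driver using this interface to expose a single passive trip point; all example_* callbacks, the temperatures and the delays are illustrative assumptions:

static int example_get_temp(struct thermal_zone_device *tz,
			    unsigned long *temp)
{
	*temp = 45000;			/* millicelsius, read from hardware */
	return 0;
}

static int example_get_trip_type(struct thermal_zone_device *tz, int trip,
				 enum thermal_trip_type *type)
{
	*type = THERMAL_TRIP_PASSIVE;
	return 0;
}

static int example_get_trip_temp(struct thermal_zone_device *tz, int trip,
				 unsigned long *temp)
{
	*temp = 75000;
	return 0;
}

static const struct thermal_zone_device_ops example_tz_ops = {
	.get_temp	= example_get_temp,
	.get_trip_type	= example_get_trip_type,
	.get_trip_temp	= example_get_trip_temp,
};

/* one read-only trip (mask 0), 500 ms passive delay, 1000 ms polling delay */
tz = thermal_zone_device_register("example_sensor", 1, 0, drvdata,
				  &example_tz_ops, NULL, 500, 1000);
if (IS_ERR(tz))
	return PTR_ERR(tz);
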
+
+/**
+ * thermal_zone_device_unregister() - removes the registered thermal zone device
+ * @tz: the thermal zone device to remove
+ */
+void thermal_zone_device_unregister(struct thermal_zone_device *tz)
+{
+       int i;
+       const struct thermal_zone_params *tzp;
+       struct thermal_cooling_device *cdev;
+       struct thermal_zone_device *pos = NULL;
+
+       if (!tz)
+               return;
+
+       tzp = tz->tzp;
+
+       mutex_lock(&thermal_list_lock);
+       list_for_each_entry(pos, &thermal_tz_list, node)
+           if (pos == tz)
+               break;
+       if (pos != tz) {
+               /* thermal zone device not found */
+               mutex_unlock(&thermal_list_lock);
+               return;
+       }
+       list_del(&tz->node);
+
+       /* Unbind all cdevs associated with 'this' thermal zone */
+       list_for_each_entry(cdev, &thermal_cdev_list, node) {
+               if (tz->ops->unbind) {
+                       tz->ops->unbind(tz, cdev);
+                       continue;
+               }
+
+               if (!tzp || !tzp->tbp)
+                       break;
+
+               for (i = 0; i < tzp->num_tbps; i++) {
+                       if (tzp->tbp[i].cdev == cdev) {
+                               __unbind(tz, tzp->tbp[i].trip_mask, cdev);
+                               tzp->tbp[i].cdev = NULL;
+                       }
+               }
+       }
+
+       mutex_unlock(&thermal_list_lock);
+
+       thermal_zone_device_set_polling(tz, 0);
+
+       if (tz->type[0])
+               device_remove_file(&tz->device, &dev_attr_type);
+       device_remove_file(&tz->device, &dev_attr_temp);
+       if (tz->ops->get_mode)
+               device_remove_file(&tz->device, &dev_attr_mode);
+       device_remove_file(&tz->device, &dev_attr_policy);
+       remove_trip_attrs(tz);
+       tz->governor = NULL;
+
+       thermal_remove_hwmon_sysfs(tz);
+       release_idr(&thermal_tz_idr, &thermal_idr_lock, tz->id);
+       idr_destroy(&tz->idr);
+       mutex_destroy(&tz->lock);
+       device_unregister(&tz->device);
+       return;
+}
+EXPORT_SYMBOL_GPL(thermal_zone_device_unregister);
+
+/**
+ * thermal_zone_get_zone_by_name() - searches for a zone and returns its ref
+ * @name: name of the thermal zone to look up
+ *
+ * When only one zone is found with the passed name, returns a reference to it.
+ *
+ * Return: On success returns a reference to the unique thermal zone whose
+ * name matches @name, an ERR_PTR otherwise (-EINVAL for invalid
+ * parameters, -ENODEV if not found and -EEXIST for multiple matches).
+ */
+struct thermal_zone_device *thermal_zone_get_zone_by_name(const char *name)
+{
+       struct thermal_zone_device *pos = NULL, *ref = ERR_PTR(-EINVAL);
+       unsigned int found = 0;
+
+       if (!name)
+               goto exit;
+
+       mutex_lock(&thermal_list_lock);
+       list_for_each_entry(pos, &thermal_tz_list, node)
+               if (!strnicmp(name, pos->type, THERMAL_NAME_LENGTH)) {
+                       found++;
+                       ref = pos;
+               }
+       mutex_unlock(&thermal_list_lock);
+
+       /* nothing has been found, thus an error code for it */
+       if (found == 0)
+               ref = ERR_PTR(-ENODEV);
+       else if (found > 1)
+       /* Success only when a unique zone is found */
+               ref = ERR_PTR(-EEXIST);
+
+exit:
+       return ref;
+}
+EXPORT_SYMBOL_GPL(thermal_zone_get_zone_by_name);
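
For example, a consumer driver that needs a reference to an already registered zone could resolve it by name (the zone name used here is illustrative):

struct thermal_zone_device *tz;

tz = thermal_zone_get_zone_by_name("example_sensor");
if (IS_ERR(tz))
	return PTR_ERR(tz);	/* -EINVAL, -ENODEV or -EEXIST */
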
+
+#ifdef CONFIG_NET
+static struct genl_family thermal_event_genl_family = {
+       .id = GENL_ID_GENERATE,
+       .name = THERMAL_GENL_FAMILY_NAME,
+       .version = THERMAL_GENL_VERSION,
+       .maxattr = THERMAL_GENL_ATTR_MAX,
+};
+
+static struct genl_multicast_group thermal_event_mcgrp = {
+       .name = THERMAL_GENL_MCAST_GROUP_NAME,
+};
+
+int thermal_generate_netlink_event(struct thermal_zone_device *tz,
+                                       enum events event)
+{
+       struct sk_buff *skb;
+       struct nlattr *attr;
+       struct thermal_genl_event *thermal_event;
+       void *msg_header;
+       int size;
+       int result;
+       static unsigned int thermal_event_seqnum;
+
+       if (!tz)
+               return -EINVAL;
+
+       /* allocate memory */
+       size = nla_total_size(sizeof(struct thermal_genl_event)) +
+              nla_total_size(0);
+
+       skb = genlmsg_new(size, GFP_ATOMIC);
+       if (!skb)
+               return -ENOMEM;
+
+       /* add the genetlink message header */
+       msg_header = genlmsg_put(skb, 0, thermal_event_seqnum++,
+                                &thermal_event_genl_family, 0,
+                                THERMAL_GENL_CMD_EVENT);
+       if (!msg_header) {
+               nlmsg_free(skb);
+               return -ENOMEM;
+       }
+
+       /* fill the data */
+       attr = nla_reserve(skb, THERMAL_GENL_ATTR_EVENT,
+                          sizeof(struct thermal_genl_event));
+
+       if (!attr) {
+               nlmsg_free(skb);
+               return -EINVAL;
+       }
+
+       thermal_event = nla_data(attr);
+       if (!thermal_event) {
+               nlmsg_free(skb);
+               return -EINVAL;
+       }
+
+       memset(thermal_event, 0, sizeof(struct thermal_genl_event));
+
+       thermal_event->orig = tz->id;
+       thermal_event->event = event;
+
+       /* send multicast genetlink message */
+       result = genlmsg_end(skb, msg_header);
+       if (result < 0) {
+               nlmsg_free(skb);
+               return result;
+       }
+
+       result = genlmsg_multicast(skb, 0, thermal_event_mcgrp.id, GFP_ATOMIC);
+       if (result)
+               dev_err(&tz->device, "Failed to send netlink event:%d", result);
+
+       return result;
+}
+EXPORT_SYMBOL_GPL(thermal_generate_netlink_event);
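
A driver only needs the zone pointer and an event type from enum events to let user space know about a thermal condition; a minimal sketch, assuming THERMAL_AUX0 as the event value:

/* notify listeners on the thermal multicast group about an auxiliary event */
thermal_generate_netlink_event(tz, THERMAL_AUX0);
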
+
+static int genetlink_init(void)
+{
+       int result;
+
+       result = genl_register_family(&thermal_event_genl_family);
+       if (result)
+               return result;
+
+       result = genl_register_mc_group(&thermal_event_genl_family,
+                                       &thermal_event_mcgrp);
+       if (result)
+               genl_unregister_family(&thermal_event_genl_family);
+       return result;
+}
+
+static void genetlink_exit(void)
+{
+       genl_unregister_family(&thermal_event_genl_family);
+}
+#else /* !CONFIG_NET */
+static inline int genetlink_init(void) { return 0; }
+static inline void genetlink_exit(void) {}
+#endif /* !CONFIG_NET */
+
+static int __init thermal_register_governors(void)
+{
+       int result;
+
+       result = thermal_gov_step_wise_register();
+       if (result)
+               return result;
+
+       result = thermal_gov_fair_share_register();
+       if (result)
+               return result;
+
+       return thermal_gov_user_space_register();
+}
+
+static void thermal_unregister_governors(void)
+{
+       thermal_gov_step_wise_unregister();
+       thermal_gov_fair_share_unregister();
+       thermal_gov_user_space_unregister();
+}
+
+static int __init thermal_init(void)
+{
+       int result;
+
+       result = thermal_register_governors();
+       if (result)
+               goto error;
+
+       result = class_register(&thermal_class);
+       if (result)
+               goto unregister_governors;
+
+       result = genetlink_init();
+       if (result)
+               goto unregister_class;
+
+       return 0;
+
+unregister_governors:
+       thermal_unregister_governors();
+unregister_class:
+       class_unregister(&thermal_class);
+error:
+       idr_destroy(&thermal_tz_idr);
+       idr_destroy(&thermal_cdev_idr);
+       mutex_destroy(&thermal_idr_lock);
+       mutex_destroy(&thermal_list_lock);
+       mutex_destroy(&thermal_governor_lock);
+       return result;
+}
+
+static void __exit thermal_exit(void)
+{
+       genetlink_exit();
+       class_unregister(&thermal_class);
+       thermal_unregister_governors();
+       idr_destroy(&thermal_tz_idr);
+       idr_destroy(&thermal_cdev_idr);
+       mutex_destroy(&thermal_idr_lock);
+       mutex_destroy(&thermal_list_lock);
+       mutex_destroy(&thermal_governor_lock);
+}
+
+fs_initcall(thermal_init);
+module_exit(thermal_exit);
index 0d3205a18112a00820fce0da168bf54aea1fa3a6..7cf2f66262517a0bfcc2727dee8232edcd2ec6d7 100644 (file)
@@ -50,4 +50,31 @@ struct thermal_instance {
        struct list_head cdev_node; /* node in cdev->thermal_instances */
 };
 
+int thermal_register_governor(struct thermal_governor *);
+void thermal_unregister_governor(struct thermal_governor *);
+
+#ifdef CONFIG_THERMAL_GOV_STEP_WISE
+int thermal_gov_step_wise_register(void);
+void thermal_gov_step_wise_unregister(void);
+#else
+static inline int thermal_gov_step_wise_register(void) { return 0; }
+static inline void thermal_gov_step_wise_unregister(void) {}
+#endif /* CONFIG_THERMAL_GOV_STEP_WISE */
+
+#ifdef CONFIG_THERMAL_GOV_FAIR_SHARE
+int thermal_gov_fair_share_register(void);
+void thermal_gov_fair_share_unregister(void);
+#else
+static inline int thermal_gov_fair_share_register(void) { return 0; }
+static inline void thermal_gov_fair_share_unregister(void) {}
+#endif /* CONFIG_THERMAL_GOV_FAIR_SHARE */
+
+#ifdef CONFIG_THERMAL_GOV_USER_SPACE
+int thermal_gov_user_space_register(void);
+void thermal_gov_user_space_unregister(void);
+#else
+static inline int thermal_gov_user_space_register(void) { return 0; }
+static inline void thermal_gov_user_space_unregister(void) {}
+#endif /* CONFIG_THERMAL_GOV_USER_SPACE */
+
 #endif /* __THERMAL_CORE_H__ */
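
These hooks only cover the built-in governors; a platform-specific policy would register itself through thermal_register_governor() directly. A minimal sketch, assuming the current struct thermal_governor layout (a name plus a throttle callback); example_governor is hypothetical:

static int example_governor_throttle(struct thermal_zone_device *tz, int trip)
{
	/* select cooling states for the instances bound to 'trip' here */
	return 0;
}

static struct thermal_governor example_governor = {
	.name		= "example_governor",
	.throttle	= example_governor_throttle,
};

/* typically called from an __init hook, like the built-in governors */
ret = thermal_register_governor(&example_governor);
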
diff --git a/drivers/thermal/thermal_sys.c b/drivers/thermal/thermal_sys.c
deleted file mode 100644 (file)
index 5b7863a..0000000
+++ /dev/null
@@ -1,1888 +0,0 @@
-/*
- *  thermal.c - Generic Thermal Management Sysfs support.
- *
- *  Copyright (C) 2008 Intel Corp
- *  Copyright (C) 2008 Zhang Rui <rui.zhang@intel.com>
- *  Copyright (C) 2008 Sujith Thomas <sujith.thomas@intel.com>
- *
- *  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- *
- *  This program is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License as published by
- *  the Free Software Foundation; version 2 of the License.
- *
- *  This program is distributed in the hope that it will be useful, but
- *  WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- *  General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License along
- *  with this program; if not, write to the Free Software Foundation, Inc.,
- *  59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
- *
- * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/module.h>
-#include <linux/device.h>
-#include <linux/err.h>
-#include <linux/slab.h>
-#include <linux/kdev_t.h>
-#include <linux/idr.h>
-#include <linux/thermal.h>
-#include <linux/reboot.h>
-#include <net/netlink.h>
-#include <net/genetlink.h>
-
-#include "thermal_core.h"
-
-MODULE_AUTHOR("Zhang Rui");
-MODULE_DESCRIPTION("Generic thermal management sysfs support");
-MODULE_LICENSE("GPL");
-
-static DEFINE_IDR(thermal_tz_idr);
-static DEFINE_IDR(thermal_cdev_idr);
-static DEFINE_MUTEX(thermal_idr_lock);
-
-static LIST_HEAD(thermal_tz_list);
-static LIST_HEAD(thermal_cdev_list);
-static LIST_HEAD(thermal_governor_list);
-
-static DEFINE_MUTEX(thermal_list_lock);
-static DEFINE_MUTEX(thermal_governor_lock);
-
-static struct thermal_governor *__find_governor(const char *name)
-{
-       struct thermal_governor *pos;
-
-       list_for_each_entry(pos, &thermal_governor_list, governor_list)
-               if (!strnicmp(name, pos->name, THERMAL_NAME_LENGTH))
-                       return pos;
-
-       return NULL;
-}
-
-int thermal_register_governor(struct thermal_governor *governor)
-{
-       int err;
-       const char *name;
-       struct thermal_zone_device *pos;
-
-       if (!governor)
-               return -EINVAL;
-
-       mutex_lock(&thermal_governor_lock);
-
-       err = -EBUSY;
-       if (__find_governor(governor->name) == NULL) {
-               err = 0;
-               list_add(&governor->governor_list, &thermal_governor_list);
-       }
-
-       mutex_lock(&thermal_list_lock);
-
-       list_for_each_entry(pos, &thermal_tz_list, node) {
-               if (pos->governor)
-                       continue;
-               if (pos->tzp)
-                       name = pos->tzp->governor_name;
-               else
-                       name = DEFAULT_THERMAL_GOVERNOR;
-               if (!strnicmp(name, governor->name, THERMAL_NAME_LENGTH))
-                       pos->governor = governor;
-       }
-
-       mutex_unlock(&thermal_list_lock);
-       mutex_unlock(&thermal_governor_lock);
-
-       return err;
-}
-EXPORT_SYMBOL_GPL(thermal_register_governor);
-
-void thermal_unregister_governor(struct thermal_governor *governor)
-{
-       struct thermal_zone_device *pos;
-
-       if (!governor)
-               return;
-
-       mutex_lock(&thermal_governor_lock);
-
-       if (__find_governor(governor->name) == NULL)
-               goto exit;
-
-       mutex_lock(&thermal_list_lock);
-
-       list_for_each_entry(pos, &thermal_tz_list, node) {
-               if (!strnicmp(pos->governor->name, governor->name,
-                                               THERMAL_NAME_LENGTH))
-                       pos->governor = NULL;
-       }
-
-       mutex_unlock(&thermal_list_lock);
-       list_del(&governor->governor_list);
-exit:
-       mutex_unlock(&thermal_governor_lock);
-       return;
-}
-EXPORT_SYMBOL_GPL(thermal_unregister_governor);
-
-static int get_idr(struct idr *idr, struct mutex *lock, int *id)
-{
-       int ret;
-
-       if (lock)
-               mutex_lock(lock);
-       ret = idr_alloc(idr, NULL, 0, 0, GFP_KERNEL);
-       if (lock)
-               mutex_unlock(lock);
-       if (unlikely(ret < 0))
-               return ret;
-       *id = ret;
-       return 0;
-}
-
-static void release_idr(struct idr *idr, struct mutex *lock, int id)
-{
-       if (lock)
-               mutex_lock(lock);
-       idr_remove(idr, id);
-       if (lock)
-               mutex_unlock(lock);
-}
-
-int get_tz_trend(struct thermal_zone_device *tz, int trip)
-{
-       enum thermal_trend trend;
-
-       if (!tz->ops->get_trend || tz->ops->get_trend(tz, trip, &trend)) {
-               if (tz->temperature > tz->last_temperature)
-                       trend = THERMAL_TREND_RAISING;
-               else if (tz->temperature < tz->last_temperature)
-                       trend = THERMAL_TREND_DROPPING;
-               else
-                       trend = THERMAL_TREND_STABLE;
-       }
-
-       return trend;
-}
-EXPORT_SYMBOL(get_tz_trend);
-
-struct thermal_instance *get_thermal_instance(struct thermal_zone_device *tz,
-                       struct thermal_cooling_device *cdev, int trip)
-{
-       struct thermal_instance *pos = NULL;
-       struct thermal_instance *target_instance = NULL;
-
-       mutex_lock(&tz->lock);
-       mutex_lock(&cdev->lock);
-
-       list_for_each_entry(pos, &tz->thermal_instances, tz_node) {
-               if (pos->tz == tz && pos->trip == trip && pos->cdev == cdev) {
-                       target_instance = pos;
-                       break;
-               }
-       }
-
-       mutex_unlock(&cdev->lock);
-       mutex_unlock(&tz->lock);
-
-       return target_instance;
-}
-EXPORT_SYMBOL(get_thermal_instance);
-
-static void print_bind_err_msg(struct thermal_zone_device *tz,
-                       struct thermal_cooling_device *cdev, int ret)
-{
-       dev_err(&tz->device, "binding zone %s with cdev %s failed:%d\n",
-                               tz->type, cdev->type, ret);
-}
-
-static void __bind(struct thermal_zone_device *tz, int mask,
-                       struct thermal_cooling_device *cdev)
-{
-       int i, ret;
-
-       for (i = 0; i < tz->trips; i++) {
-               if (mask & (1 << i)) {
-                       ret = thermal_zone_bind_cooling_device(tz, i, cdev,
-                                       THERMAL_NO_LIMIT, THERMAL_NO_LIMIT);
-                       if (ret)
-                               print_bind_err_msg(tz, cdev, ret);
-               }
-       }
-}
-
-static void __unbind(struct thermal_zone_device *tz, int mask,
-                       struct thermal_cooling_device *cdev)
-{
-       int i;
-
-       for (i = 0; i < tz->trips; i++)
-               if (mask & (1 << i))
-                       thermal_zone_unbind_cooling_device(tz, i, cdev);
-}
-
-static void bind_cdev(struct thermal_cooling_device *cdev)
-{
-       int i, ret;
-       const struct thermal_zone_params *tzp;
-       struct thermal_zone_device *pos = NULL;
-
-       mutex_lock(&thermal_list_lock);
-
-       list_for_each_entry(pos, &thermal_tz_list, node) {
-               if (!pos->tzp && !pos->ops->bind)
-                       continue;
-
-               if (!pos->tzp && pos->ops->bind) {
-                       ret = pos->ops->bind(pos, cdev);
-                       if (ret)
-                               print_bind_err_msg(pos, cdev, ret);
-               }
-
-               tzp = pos->tzp;
-               if (!tzp || !tzp->tbp)
-                       continue;
-
-               for (i = 0; i < tzp->num_tbps; i++) {
-                       if (tzp->tbp[i].cdev || !tzp->tbp[i].match)
-                               continue;
-                       if (tzp->tbp[i].match(pos, cdev))
-                               continue;
-                       tzp->tbp[i].cdev = cdev;
-                       __bind(pos, tzp->tbp[i].trip_mask, cdev);
-               }
-       }
-
-       mutex_unlock(&thermal_list_lock);
-}
-
-static void bind_tz(struct thermal_zone_device *tz)
-{
-       int i, ret;
-       struct thermal_cooling_device *pos = NULL;
-       const struct thermal_zone_params *tzp = tz->tzp;
-
-       if (!tzp && !tz->ops->bind)
-               return;
-
-       mutex_lock(&thermal_list_lock);
-
-       /* If there is no platform data, try to use ops->bind */
-       if (!tzp && tz->ops->bind) {
-               list_for_each_entry(pos, &thermal_cdev_list, node) {
-                       ret = tz->ops->bind(tz, pos);
-                       if (ret)
-                               print_bind_err_msg(tz, pos, ret);
-               }
-               goto exit;
-       }
-
-       if (!tzp || !tzp->tbp)
-               goto exit;
-
-       list_for_each_entry(pos, &thermal_cdev_list, node) {
-               for (i = 0; i < tzp->num_tbps; i++) {
-                       if (tzp->tbp[i].cdev || !tzp->tbp[i].match)
-                               continue;
-                       if (tzp->tbp[i].match(tz, pos))
-                               continue;
-                       tzp->tbp[i].cdev = pos;
-                       __bind(tz, tzp->tbp[i].trip_mask, pos);
-               }
-       }
-exit:
-       mutex_unlock(&thermal_list_lock);
-}
-
-static void thermal_zone_device_set_polling(struct thermal_zone_device *tz,
-                                           int delay)
-{
-       if (delay > 1000)
-               mod_delayed_work(system_freezable_wq, &tz->poll_queue,
-                                round_jiffies(msecs_to_jiffies(delay)));
-       else if (delay)
-               mod_delayed_work(system_freezable_wq, &tz->poll_queue,
-                                msecs_to_jiffies(delay));
-       else
-               cancel_delayed_work(&tz->poll_queue);
-}
-
-static void monitor_thermal_zone(struct thermal_zone_device *tz)
-{
-       mutex_lock(&tz->lock);
-
-       if (tz->passive)
-               thermal_zone_device_set_polling(tz, tz->passive_delay);
-       else if (tz->polling_delay)
-               thermal_zone_device_set_polling(tz, tz->polling_delay);
-       else
-               thermal_zone_device_set_polling(tz, 0);
-
-       mutex_unlock(&tz->lock);
-}
-
-static void handle_non_critical_trips(struct thermal_zone_device *tz,
-                       int trip, enum thermal_trip_type trip_type)
-{
-       if (tz->governor)
-               tz->governor->throttle(tz, trip);
-}
-
-static void handle_critical_trips(struct thermal_zone_device *tz,
-                               int trip, enum thermal_trip_type trip_type)
-{
-       long trip_temp;
-
-       tz->ops->get_trip_temp(tz, trip, &trip_temp);
-
-       /* If we have not crossed the trip_temp, we do not care. */
-       if (tz->temperature < trip_temp)
-               return;
-
-       if (tz->ops->notify)
-               tz->ops->notify(tz, trip, trip_type);
-
-       if (trip_type == THERMAL_TRIP_CRITICAL) {
-               dev_emerg(&tz->device,
-                         "critical temperature reached(%d C),shutting down\n",
-                         tz->temperature / 1000);
-               orderly_poweroff(true);
-       }
-}
-
-static void handle_thermal_trip(struct thermal_zone_device *tz, int trip)
-{
-       enum thermal_trip_type type;
-
-       tz->ops->get_trip_type(tz, trip, &type);
-
-       if (type == THERMAL_TRIP_CRITICAL || type == THERMAL_TRIP_HOT)
-               handle_critical_trips(tz, trip, type);
-       else
-               handle_non_critical_trips(tz, trip, type);
-       /*
-        * Alright, we handled this trip successfully.
-        * So, start monitoring again.
-        */
-       monitor_thermal_zone(tz);
-}
-
-static int thermal_zone_get_temp(struct thermal_zone_device *tz,
-                               unsigned long *temp)
-{
-       int ret = 0;
-#ifdef CONFIG_THERMAL_EMULATION
-       int count;
-       unsigned long crit_temp = -1UL;
-       enum thermal_trip_type type;
-#endif
-
-       mutex_lock(&tz->lock);
-
-       ret = tz->ops->get_temp(tz, temp);
-#ifdef CONFIG_THERMAL_EMULATION
-       if (!tz->emul_temperature)
-               goto skip_emul;
-
-       for (count = 0; count < tz->trips; count++) {
-               ret = tz->ops->get_trip_type(tz, count, &type);
-               if (!ret && type == THERMAL_TRIP_CRITICAL) {
-                       ret = tz->ops->get_trip_temp(tz, count, &crit_temp);
-                       break;
-               }
-       }
-
-       if (ret)
-               goto skip_emul;
-
-       if (*temp < crit_temp)
-               *temp = tz->emul_temperature;
-skip_emul:
-#endif
-       mutex_unlock(&tz->lock);
-       return ret;
-}
-
-static void update_temperature(struct thermal_zone_device *tz)
-{
-       long temp;
-       int ret;
-
-       ret = thermal_zone_get_temp(tz, &temp);
-       if (ret) {
-               dev_warn(&tz->device, "failed to read out thermal zone %d\n",
-                        tz->id);
-               return;
-       }
-
-       mutex_lock(&tz->lock);
-       tz->last_temperature = tz->temperature;
-       tz->temperature = temp;
-       mutex_unlock(&tz->lock);
-}
-
-void thermal_zone_device_update(struct thermal_zone_device *tz)
-{
-       int count;
-
-       update_temperature(tz);
-
-       for (count = 0; count < tz->trips; count++)
-               handle_thermal_trip(tz, count);
-}
-EXPORT_SYMBOL(thermal_zone_device_update);
-
-static void thermal_zone_device_check(struct work_struct *work)
-{
-       struct thermal_zone_device *tz = container_of(work, struct
-                                                     thermal_zone_device,
-                                                     poll_queue.work);
-       thermal_zone_device_update(tz);
-}
-
-/* sys I/F for thermal zone */
-
-#define to_thermal_zone(_dev) \
-       container_of(_dev, struct thermal_zone_device, device)
-
-static ssize_t
-type_show(struct device *dev, struct device_attribute *attr, char *buf)
-{
-       struct thermal_zone_device *tz = to_thermal_zone(dev);
-
-       return sprintf(buf, "%s\n", tz->type);
-}
-
-static ssize_t
-temp_show(struct device *dev, struct device_attribute *attr, char *buf)
-{
-       struct thermal_zone_device *tz = to_thermal_zone(dev);
-       long temperature;
-       int ret;
-
-       ret = thermal_zone_get_temp(tz, &temperature);
-
-       if (ret)
-               return ret;
-
-       return sprintf(buf, "%ld\n", temperature);
-}
-
-static ssize_t
-mode_show(struct device *dev, struct device_attribute *attr, char *buf)
-{
-       struct thermal_zone_device *tz = to_thermal_zone(dev);
-       enum thermal_device_mode mode;
-       int result;
-
-       if (!tz->ops->get_mode)
-               return -EPERM;
-
-       result = tz->ops->get_mode(tz, &mode);
-       if (result)
-               return result;
-
-       return sprintf(buf, "%s\n", mode == THERMAL_DEVICE_ENABLED ? "enabled"
-                      : "disabled");
-}
-
-static ssize_t
-mode_store(struct device *dev, struct device_attribute *attr,
-          const char *buf, size_t count)
-{
-       struct thermal_zone_device *tz = to_thermal_zone(dev);
-       int result;
-
-       if (!tz->ops->set_mode)
-               return -EPERM;
-
-       if (!strncmp(buf, "enabled", sizeof("enabled") - 1))
-               result = tz->ops->set_mode(tz, THERMAL_DEVICE_ENABLED);
-       else if (!strncmp(buf, "disabled", sizeof("disabled") - 1))
-               result = tz->ops->set_mode(tz, THERMAL_DEVICE_DISABLED);
-       else
-               result = -EINVAL;
-
-       if (result)
-               return result;
-
-       return count;
-}
-
-static ssize_t
-trip_point_type_show(struct device *dev, struct device_attribute *attr,
-                    char *buf)
-{
-       struct thermal_zone_device *tz = to_thermal_zone(dev);
-       enum thermal_trip_type type;
-       int trip, result;
-
-       if (!tz->ops->get_trip_type)
-               return -EPERM;
-
-       if (!sscanf(attr->attr.name, "trip_point_%d_type", &trip))
-               return -EINVAL;
-
-       result = tz->ops->get_trip_type(tz, trip, &type);
-       if (result)
-               return result;
-
-       switch (type) {
-       case THERMAL_TRIP_CRITICAL:
-               return sprintf(buf, "critical\n");
-       case THERMAL_TRIP_HOT:
-               return sprintf(buf, "hot\n");
-       case THERMAL_TRIP_PASSIVE:
-               return sprintf(buf, "passive\n");
-       case THERMAL_TRIP_ACTIVE:
-               return sprintf(buf, "active\n");
-       default:
-               return sprintf(buf, "unknown\n");
-       }
-}
-
-static ssize_t
-trip_point_temp_store(struct device *dev, struct device_attribute *attr,
-                    const char *buf, size_t count)
-{
-       struct thermal_zone_device *tz = to_thermal_zone(dev);
-       int trip, ret;
-       unsigned long temperature;
-
-       if (!tz->ops->set_trip_temp)
-               return -EPERM;
-
-       if (!sscanf(attr->attr.name, "trip_point_%d_temp", &trip))
-               return -EINVAL;
-
-       if (kstrtoul(buf, 10, &temperature))
-               return -EINVAL;
-
-       ret = tz->ops->set_trip_temp(tz, trip, temperature);
-
-       return ret ? ret : count;
-}
-
-static ssize_t
-trip_point_temp_show(struct device *dev, struct device_attribute *attr,
-                    char *buf)
-{
-       struct thermal_zone_device *tz = to_thermal_zone(dev);
-       int trip, ret;
-       long temperature;
-
-       if (!tz->ops->get_trip_temp)
-               return -EPERM;
-
-       if (!sscanf(attr->attr.name, "trip_point_%d_temp", &trip))
-               return -EINVAL;
-
-       ret = tz->ops->get_trip_temp(tz, trip, &temperature);
-
-       if (ret)
-               return ret;
-
-       return sprintf(buf, "%ld\n", temperature);
-}
-
-static ssize_t
-trip_point_hyst_store(struct device *dev, struct device_attribute *attr,
-                       const char *buf, size_t count)
-{
-       struct thermal_zone_device *tz = to_thermal_zone(dev);
-       int trip, ret;
-       unsigned long temperature;
-
-       if (!tz->ops->set_trip_hyst)
-               return -EPERM;
-
-       if (!sscanf(attr->attr.name, "trip_point_%d_hyst", &trip))
-               return -EINVAL;
-
-       if (kstrtoul(buf, 10, &temperature))
-               return -EINVAL;
-
-       /*
-        * We are not doing any check on the 'temperature' value
-        * here. The driver implementing 'set_trip_hyst' has to
-        * take care of this.
-        */
-       ret = tz->ops->set_trip_hyst(tz, trip, temperature);
-
-       return ret ? ret : count;
-}
-
-static ssize_t
-trip_point_hyst_show(struct device *dev, struct device_attribute *attr,
-                       char *buf)
-{
-       struct thermal_zone_device *tz = to_thermal_zone(dev);
-       int trip, ret;
-       unsigned long temperature;
-
-       if (!tz->ops->get_trip_hyst)
-               return -EPERM;
-
-       if (!sscanf(attr->attr.name, "trip_point_%d_hyst", &trip))
-               return -EINVAL;
-
-       ret = tz->ops->get_trip_hyst(tz, trip, &temperature);
-
-       return ret ? ret : sprintf(buf, "%ld\n", temperature);
-}
-
-static ssize_t
-passive_store(struct device *dev, struct device_attribute *attr,
-                   const char *buf, size_t count)
-{
-       struct thermal_zone_device *tz = to_thermal_zone(dev);
-       struct thermal_cooling_device *cdev = NULL;
-       int state;
-
-       if (!sscanf(buf, "%d\n", &state))
-               return -EINVAL;
-
-       /* sanity check: values below 1000 millicelcius don't make sense
-        * and can cause the system to go into a thermal heart attack
-        */
-       if (state && state < 1000)
-               return -EINVAL;
-
-       if (state && !tz->forced_passive) {
-               mutex_lock(&thermal_list_lock);
-               list_for_each_entry(cdev, &thermal_cdev_list, node) {
-                       if (!strncmp("Processor", cdev->type,
-                                    sizeof("Processor")))
-                               thermal_zone_bind_cooling_device(tz,
-                                               THERMAL_TRIPS_NONE, cdev,
-                                               THERMAL_NO_LIMIT,
-                                               THERMAL_NO_LIMIT);
-               }
-               mutex_unlock(&thermal_list_lock);
-               if (!tz->passive_delay)
-                       tz->passive_delay = 1000;
-       } else if (!state && tz->forced_passive) {
-               mutex_lock(&thermal_list_lock);
-               list_for_each_entry(cdev, &thermal_cdev_list, node) {
-                       if (!strncmp("Processor", cdev->type,
-                                    sizeof("Processor")))
-                               thermal_zone_unbind_cooling_device(tz,
-                                                                  THERMAL_TRIPS_NONE,
-                                                                  cdev);
-               }
-               mutex_unlock(&thermal_list_lock);
-               tz->passive_delay = 0;
-       }
-
-       tz->forced_passive = state;
-
-       thermal_zone_device_update(tz);
-
-       return count;
-}
-
-static ssize_t
-passive_show(struct device *dev, struct device_attribute *attr,
-                  char *buf)
-{
-       struct thermal_zone_device *tz = to_thermal_zone(dev);
-
-       return sprintf(buf, "%d\n", tz->forced_passive);
-}
-
-static ssize_t
-policy_store(struct device *dev, struct device_attribute *attr,
-                   const char *buf, size_t count)
-{
-       int ret = -EINVAL;
-       struct thermal_zone_device *tz = to_thermal_zone(dev);
-       struct thermal_governor *gov;
-
-       mutex_lock(&thermal_governor_lock);
-
-       gov = __find_governor(buf);
-       if (!gov)
-               goto exit;
-
-       tz->governor = gov;
-       ret = count;
-
-exit:
-       mutex_unlock(&thermal_governor_lock);
-       return ret;
-}
-
-static ssize_t
-policy_show(struct device *dev, struct device_attribute *devattr, char *buf)
-{
-       struct thermal_zone_device *tz = to_thermal_zone(dev);
-
-       return sprintf(buf, "%s\n", tz->governor->name);
-}
-
-#ifdef CONFIG_THERMAL_EMULATION
-static ssize_t
-emul_temp_store(struct device *dev, struct device_attribute *attr,
-                    const char *buf, size_t count)
-{
-       struct thermal_zone_device *tz = to_thermal_zone(dev);
-       int ret = 0;
-       unsigned long temperature;
-
-       if (kstrtoul(buf, 10, &temperature))
-               return -EINVAL;
-
-       if (!tz->ops->set_emul_temp) {
-               mutex_lock(&tz->lock);
-               tz->emul_temperature = temperature;
-               mutex_unlock(&tz->lock);
-       } else {
-               ret = tz->ops->set_emul_temp(tz, temperature);
-       }
-
-       return ret ? ret : count;
-}
-static DEVICE_ATTR(emul_temp, S_IWUSR, NULL, emul_temp_store);
-#endif/*CONFIG_THERMAL_EMULATION*/
-
-static DEVICE_ATTR(type, 0444, type_show, NULL);
-static DEVICE_ATTR(temp, 0444, temp_show, NULL);
-static DEVICE_ATTR(mode, 0644, mode_show, mode_store);
-static DEVICE_ATTR(passive, S_IRUGO | S_IWUSR, passive_show, passive_store);
-static DEVICE_ATTR(policy, S_IRUGO | S_IWUSR, policy_show, policy_store);
-
-/* sys I/F for cooling device */
-#define to_cooling_device(_dev)        \
-       container_of(_dev, struct thermal_cooling_device, device)
-
-static ssize_t
-thermal_cooling_device_type_show(struct device *dev,
-                                struct device_attribute *attr, char *buf)
-{
-       struct thermal_cooling_device *cdev = to_cooling_device(dev);
-
-       return sprintf(buf, "%s\n", cdev->type);
-}
-
-static ssize_t
-thermal_cooling_device_max_state_show(struct device *dev,
-                                     struct device_attribute *attr, char *buf)
-{
-       struct thermal_cooling_device *cdev = to_cooling_device(dev);
-       unsigned long state;
-       int ret;
-
-       ret = cdev->ops->get_max_state(cdev, &state);
-       if (ret)
-               return ret;
-       return sprintf(buf, "%ld\n", state);
-}
-
-static ssize_t
-thermal_cooling_device_cur_state_show(struct device *dev,
-                                     struct device_attribute *attr, char *buf)
-{
-       struct thermal_cooling_device *cdev = to_cooling_device(dev);
-       unsigned long state;
-       int ret;
-
-       ret = cdev->ops->get_cur_state(cdev, &state);
-       if (ret)
-               return ret;
-       return sprintf(buf, "%ld\n", state);
-}
-
-static ssize_t
-thermal_cooling_device_cur_state_store(struct device *dev,
-                                      struct device_attribute *attr,
-                                      const char *buf, size_t count)
-{
-       struct thermal_cooling_device *cdev = to_cooling_device(dev);
-       unsigned long state;
-       int result;
-
-       if (!sscanf(buf, "%ld\n", &state))
-               return -EINVAL;
-
-       if ((long)state < 0)
-               return -EINVAL;
-
-       result = cdev->ops->set_cur_state(cdev, state);
-       if (result)
-               return result;
-       return count;
-}
-
-static struct device_attribute dev_attr_cdev_type =
-__ATTR(type, 0444, thermal_cooling_device_type_show, NULL);
-static DEVICE_ATTR(max_state, 0444,
-                  thermal_cooling_device_max_state_show, NULL);
-static DEVICE_ATTR(cur_state, 0644,
-                  thermal_cooling_device_cur_state_show,
-                  thermal_cooling_device_cur_state_store);
-
-static ssize_t
-thermal_cooling_device_trip_point_show(struct device *dev,
-                                      struct device_attribute *attr, char *buf)
-{
-       struct thermal_instance *instance;
-
-       instance =
-           container_of(attr, struct thermal_instance, attr);
-
-       if (instance->trip == THERMAL_TRIPS_NONE)
-               return sprintf(buf, "-1\n");
-       else
-               return sprintf(buf, "%d\n", instance->trip);
-}
-
-/* Device management */
-
-#if defined(CONFIG_THERMAL_HWMON)
-
-/* hwmon sys I/F */
-#include <linux/hwmon.h>
-
-/* thermal zone devices with the same type share one hwmon device */
-struct thermal_hwmon_device {
-       char type[THERMAL_NAME_LENGTH];
-       struct device *device;
-       int count;
-       struct list_head tz_list;
-       struct list_head node;
-};
-
-struct thermal_hwmon_attr {
-       struct device_attribute attr;
-       char name[16];
-};
-
-/* one temperature input for each thermal zone */
-struct thermal_hwmon_temp {
-       struct list_head hwmon_node;
-       struct thermal_zone_device *tz;
-       struct thermal_hwmon_attr temp_input;   /* hwmon sys attr */
-       struct thermal_hwmon_attr temp_crit;    /* hwmon sys attr */
-};
-
-static LIST_HEAD(thermal_hwmon_list);
-
-static ssize_t
-name_show(struct device *dev, struct device_attribute *attr, char *buf)
-{
-       struct thermal_hwmon_device *hwmon = dev_get_drvdata(dev);
-       return sprintf(buf, "%s\n", hwmon->type);
-}
-static DEVICE_ATTR(name, 0444, name_show, NULL);
-
-static ssize_t
-temp_input_show(struct device *dev, struct device_attribute *attr, char *buf)
-{
-       long temperature;
-       int ret;
-       struct thermal_hwmon_attr *hwmon_attr
-                       = container_of(attr, struct thermal_hwmon_attr, attr);
-       struct thermal_hwmon_temp *temp
-                       = container_of(hwmon_attr, struct thermal_hwmon_temp,
-                                      temp_input);
-       struct thermal_zone_device *tz = temp->tz;
-
-       ret = thermal_zone_get_temp(tz, &temperature);
-
-       if (ret)
-               return ret;
-
-       return sprintf(buf, "%ld\n", temperature);
-}
-
-static ssize_t
-temp_crit_show(struct device *dev, struct device_attribute *attr,
-               char *buf)
-{
-       struct thermal_hwmon_attr *hwmon_attr
-                       = container_of(attr, struct thermal_hwmon_attr, attr);
-       struct thermal_hwmon_temp *temp
-                       = container_of(hwmon_attr, struct thermal_hwmon_temp,
-                                      temp_crit);
-       struct thermal_zone_device *tz = temp->tz;
-       long temperature;
-       int ret;
-
-       ret = tz->ops->get_trip_temp(tz, 0, &temperature);
-       if (ret)
-               return ret;
-
-       return sprintf(buf, "%ld\n", temperature);
-}
-
-
-static struct thermal_hwmon_device *
-thermal_hwmon_lookup_by_type(const struct thermal_zone_device *tz)
-{
-       struct thermal_hwmon_device *hwmon;
-
-       mutex_lock(&thermal_list_lock);
-       list_for_each_entry(hwmon, &thermal_hwmon_list, node)
-               if (!strcmp(hwmon->type, tz->type)) {
-                       mutex_unlock(&thermal_list_lock);
-                       return hwmon;
-               }
-       mutex_unlock(&thermal_list_lock);
-
-       return NULL;
-}
-
-/* Find the temperature input matching a given thermal zone */
-static struct thermal_hwmon_temp *
-thermal_hwmon_lookup_temp(const struct thermal_hwmon_device *hwmon,
-                         const struct thermal_zone_device *tz)
-{
-       struct thermal_hwmon_temp *temp;
-
-       mutex_lock(&thermal_list_lock);
-       list_for_each_entry(temp, &hwmon->tz_list, hwmon_node)
-               if (temp->tz == tz) {
-                       mutex_unlock(&thermal_list_lock);
-                       return temp;
-               }
-       mutex_unlock(&thermal_list_lock);
-
-       return NULL;
-}
-
-static int
-thermal_add_hwmon_sysfs(struct thermal_zone_device *tz)
-{
-       struct thermal_hwmon_device *hwmon;
-       struct thermal_hwmon_temp *temp;
-       int new_hwmon_device = 1;
-       int result;
-
-       hwmon = thermal_hwmon_lookup_by_type(tz);
-       if (hwmon) {
-               new_hwmon_device = 0;
-               goto register_sys_interface;
-       }
-
-       hwmon = kzalloc(sizeof(struct thermal_hwmon_device), GFP_KERNEL);
-       if (!hwmon)
-               return -ENOMEM;
-
-       INIT_LIST_HEAD(&hwmon->tz_list);
-       strlcpy(hwmon->type, tz->type, THERMAL_NAME_LENGTH);
-       hwmon->device = hwmon_device_register(NULL);
-       if (IS_ERR(hwmon->device)) {
-               result = PTR_ERR(hwmon->device);
-               goto free_mem;
-       }
-       dev_set_drvdata(hwmon->device, hwmon);
-       result = device_create_file(hwmon->device, &dev_attr_name);
-       if (result)
-               goto free_mem;
-
- register_sys_interface:
-       temp = kzalloc(sizeof(struct thermal_hwmon_temp), GFP_KERNEL);
-       if (!temp) {
-               result = -ENOMEM;
-               goto unregister_name;
-       }
-
-       temp->tz = tz;
-       hwmon->count++;
-
-       snprintf(temp->temp_input.name, sizeof(temp->temp_input.name),
-                "temp%d_input", hwmon->count);
-       temp->temp_input.attr.attr.name = temp->temp_input.name;
-       temp->temp_input.attr.attr.mode = 0444;
-       temp->temp_input.attr.show = temp_input_show;
-       sysfs_attr_init(&temp->temp_input.attr.attr);
-       result = device_create_file(hwmon->device, &temp->temp_input.attr);
-       if (result)
-               goto free_temp_mem;
-
-       if (tz->ops->get_crit_temp) {
-               unsigned long temperature;
-               if (!tz->ops->get_crit_temp(tz, &temperature)) {
-                       snprintf(temp->temp_crit.name,
-                                sizeof(temp->temp_crit.name),
-                               "temp%d_crit", hwmon->count);
-                       temp->temp_crit.attr.attr.name = temp->temp_crit.name;
-                       temp->temp_crit.attr.attr.mode = 0444;
-                       temp->temp_crit.attr.show = temp_crit_show;
-                       sysfs_attr_init(&temp->temp_crit.attr.attr);
-                       result = device_create_file(hwmon->device,
-                                                   &temp->temp_crit.attr);
-                       if (result)
-                               goto unregister_input;
-               }
-       }
-
-       mutex_lock(&thermal_list_lock);
-       if (new_hwmon_device)
-               list_add_tail(&hwmon->node, &thermal_hwmon_list);
-       list_add_tail(&temp->hwmon_node, &hwmon->tz_list);
-       mutex_unlock(&thermal_list_lock);
-
-       return 0;
-
- unregister_input:
-       device_remove_file(hwmon->device, &temp->temp_input.attr);
- free_temp_mem:
-       kfree(temp);
- unregister_name:
-       if (new_hwmon_device) {
-               device_remove_file(hwmon->device, &dev_attr_name);
-               hwmon_device_unregister(hwmon->device);
-       }
- free_mem:
-       if (new_hwmon_device)
-               kfree(hwmon);
-
-       return result;
-}
-
-static void
-thermal_remove_hwmon_sysfs(struct thermal_zone_device *tz)
-{
-       struct thermal_hwmon_device *hwmon;
-       struct thermal_hwmon_temp *temp;
-
-       hwmon = thermal_hwmon_lookup_by_type(tz);
-       if (unlikely(!hwmon)) {
-               /* Should never happen... */
-               dev_dbg(&tz->device, "hwmon device lookup failed!\n");
-               return;
-       }
-
-       temp = thermal_hwmon_lookup_temp(hwmon, tz);
-       if (unlikely(!temp)) {
-               /* Should never happen... */
-               dev_dbg(&tz->device, "temperature input lookup failed!\n");
-               return;
-       }
-
-       device_remove_file(hwmon->device, &temp->temp_input.attr);
-       if (tz->ops->get_crit_temp)
-               device_remove_file(hwmon->device, &temp->temp_crit.attr);
-
-       mutex_lock(&thermal_list_lock);
-       list_del(&temp->hwmon_node);
-       kfree(temp);
-       if (!list_empty(&hwmon->tz_list)) {
-               mutex_unlock(&thermal_list_lock);
-               return;
-       }
-       list_del(&hwmon->node);
-       mutex_unlock(&thermal_list_lock);
-
-       device_remove_file(hwmon->device, &dev_attr_name);
-       hwmon_device_unregister(hwmon->device);
-       kfree(hwmon);
-}
-#else
-static int
-thermal_add_hwmon_sysfs(struct thermal_zone_device *tz)
-{
-       return 0;
-}
-
-static void
-thermal_remove_hwmon_sysfs(struct thermal_zone_device *tz)
-{
-}
-#endif
-
-/**
- * thermal_zone_bind_cooling_device - bind a cooling device to a thermal zone
- * @tz:                thermal zone device
- * @trip:      indicates which trip point the cooling devices is
- *             associated with in this thermal zone.
- * @cdev:      thermal cooling device
- *
- * This function is usually called in the thermal zone device .bind callback.
- */
-int thermal_zone_bind_cooling_device(struct thermal_zone_device *tz,
-                                    int trip,
-                                    struct thermal_cooling_device *cdev,
-                                    unsigned long upper, unsigned long lower)
-{
-       struct thermal_instance *dev;
-       struct thermal_instance *pos;
-       struct thermal_zone_device *pos1;
-       struct thermal_cooling_device *pos2;
-       unsigned long max_state;
-       int result;
-
-       if (trip >= tz->trips || (trip < 0 && trip != THERMAL_TRIPS_NONE))
-               return -EINVAL;
-
-       list_for_each_entry(pos1, &thermal_tz_list, node) {
-               if (pos1 == tz)
-                       break;
-       }
-       list_for_each_entry(pos2, &thermal_cdev_list, node) {
-               if (pos2 == cdev)
-                       break;
-       }
-
-       if (tz != pos1 || cdev != pos2)
-               return -EINVAL;
-
-       cdev->ops->get_max_state(cdev, &max_state);
-
-       /* lower default 0, upper default max_state */
-       lower = lower == THERMAL_NO_LIMIT ? 0 : lower;
-       upper = upper == THERMAL_NO_LIMIT ? max_state : upper;
-
-       if (lower > upper || upper > max_state)
-               return -EINVAL;
-
-       dev =
-           kzalloc(sizeof(struct thermal_instance), GFP_KERNEL);
-       if (!dev)
-               return -ENOMEM;
-       dev->tz = tz;
-       dev->cdev = cdev;
-       dev->trip = trip;
-       dev->upper = upper;
-       dev->lower = lower;
-       dev->target = THERMAL_NO_TARGET;
-
-       result = get_idr(&tz->idr, &tz->lock, &dev->id);
-       if (result)
-               goto free_mem;
-
-       sprintf(dev->name, "cdev%d", dev->id);
-       result =
-           sysfs_create_link(&tz->device.kobj, &cdev->device.kobj, dev->name);
-       if (result)
-               goto release_idr;
-
-       sprintf(dev->attr_name, "cdev%d_trip_point", dev->id);
-       sysfs_attr_init(&dev->attr.attr);
-       dev->attr.attr.name = dev->attr_name;
-       dev->attr.attr.mode = 0444;
-       dev->attr.show = thermal_cooling_device_trip_point_show;
-       result = device_create_file(&tz->device, &dev->attr);
-       if (result)
-               goto remove_symbol_link;
-
-       mutex_lock(&tz->lock);
-       mutex_lock(&cdev->lock);
-       list_for_each_entry(pos, &tz->thermal_instances, tz_node)
-           if (pos->tz == tz && pos->trip == trip && pos->cdev == cdev) {
-               result = -EEXIST;
-               break;
-       }
-       if (!result) {
-               list_add_tail(&dev->tz_node, &tz->thermal_instances);
-               list_add_tail(&dev->cdev_node, &cdev->thermal_instances);
-       }
-       mutex_unlock(&cdev->lock);
-       mutex_unlock(&tz->lock);
-
-       if (!result)
-               return 0;
-
-       device_remove_file(&tz->device, &dev->attr);
-remove_symbol_link:
-       sysfs_remove_link(&tz->device.kobj, dev->name);
-release_idr:
-       release_idr(&tz->idr, &tz->lock, dev->id);
-free_mem:
-       kfree(dev);
-       return result;
-}
-EXPORT_SYMBOL(thermal_zone_bind_cooling_device);
-
-/**
- * thermal_zone_unbind_cooling_device - unbind a cooling device from a thermal zone
- * @tz:                thermal zone device
- * @trip:      indicates which trip point the cooling devices is
- *             associated with in this thermal zone.
- * @cdev:      thermal cooling device
- *
- * This function is usually called in the thermal zone device .unbind callback.
- */
-int thermal_zone_unbind_cooling_device(struct thermal_zone_device *tz,
-                                      int trip,
-                                      struct thermal_cooling_device *cdev)
-{
-       struct thermal_instance *pos, *next;
-
-       mutex_lock(&tz->lock);
-       mutex_lock(&cdev->lock);
-       list_for_each_entry_safe(pos, next, &tz->thermal_instances, tz_node) {
-               if (pos->tz == tz && pos->trip == trip && pos->cdev == cdev) {
-                       list_del(&pos->tz_node);
-                       list_del(&pos->cdev_node);
-                       mutex_unlock(&cdev->lock);
-                       mutex_unlock(&tz->lock);
-                       goto unbind;
-               }
-       }
-       mutex_unlock(&cdev->lock);
-       mutex_unlock(&tz->lock);
-
-       return -ENODEV;
-
-unbind:
-       device_remove_file(&tz->device, &pos->attr);
-       sysfs_remove_link(&tz->device.kobj, pos->name);
-       release_idr(&tz->idr, &tz->lock, pos->id);
-       kfree(pos);
-       return 0;
-}
-EXPORT_SYMBOL(thermal_zone_unbind_cooling_device);
-
-static void thermal_release(struct device *dev)
-{
-       struct thermal_zone_device *tz;
-       struct thermal_cooling_device *cdev;
-
-       if (!strncmp(dev_name(dev), "thermal_zone",
-                    sizeof("thermal_zone") - 1)) {
-               tz = to_thermal_zone(dev);
-               kfree(tz);
-       } else {
-               cdev = to_cooling_device(dev);
-               kfree(cdev);
-       }
-}
-
-static struct class thermal_class = {
-       .name = "thermal",
-       .dev_release = thermal_release,
-};
-
-/**
- * thermal_cooling_device_register - register a new thermal cooling device
- * @type:      the thermal cooling device type.
- * @devdata:   device private data.
- * @ops:               standard thermal cooling devices callbacks.
- */
-struct thermal_cooling_device *
-thermal_cooling_device_register(char *type, void *devdata,
-                               const struct thermal_cooling_device_ops *ops)
-{
-       struct thermal_cooling_device *cdev;
-       int result;
-
-       if (type && strlen(type) >= THERMAL_NAME_LENGTH)
-               return ERR_PTR(-EINVAL);
-
-       if (!ops || !ops->get_max_state || !ops->get_cur_state ||
-           !ops->set_cur_state)
-               return ERR_PTR(-EINVAL);
-
-       cdev = kzalloc(sizeof(struct thermal_cooling_device), GFP_KERNEL);
-       if (!cdev)
-               return ERR_PTR(-ENOMEM);
-
-       result = get_idr(&thermal_cdev_idr, &thermal_idr_lock, &cdev->id);
-       if (result) {
-               kfree(cdev);
-               return ERR_PTR(result);
-       }
-
-       strcpy(cdev->type, type ? : "");
-       mutex_init(&cdev->lock);
-       INIT_LIST_HEAD(&cdev->thermal_instances);
-       cdev->ops = ops;
-       cdev->updated = true;
-       cdev->device.class = &thermal_class;
-       cdev->devdata = devdata;
-       dev_set_name(&cdev->device, "cooling_device%d", cdev->id);
-       result = device_register(&cdev->device);
-       if (result) {
-               release_idr(&thermal_cdev_idr, &thermal_idr_lock, cdev->id);
-               kfree(cdev);
-               return ERR_PTR(result);
-       }
-
-       /* sys I/F */
-       if (type) {
-               result = device_create_file(&cdev->device, &dev_attr_cdev_type);
-               if (result)
-                       goto unregister;
-       }
-
-       result = device_create_file(&cdev->device, &dev_attr_max_state);
-       if (result)
-               goto unregister;
-
-       result = device_create_file(&cdev->device, &dev_attr_cur_state);
-       if (result)
-               goto unregister;
-
-       /* Add 'this' new cdev to the global cdev list */
-       mutex_lock(&thermal_list_lock);
-       list_add(&cdev->node, &thermal_cdev_list);
-       mutex_unlock(&thermal_list_lock);
-
-       /* Update binding information for 'this' new cdev */
-       bind_cdev(cdev);
-
-       return cdev;
-
-unregister:
-       release_idr(&thermal_cdev_idr, &thermal_idr_lock, cdev->id);
-       device_unregister(&cdev->device);
-       return ERR_PTR(result);
-}
-EXPORT_SYMBOL(thermal_cooling_device_register);
-
-/**
- * thermal_cooling_device_unregister - removes the registered thermal cooling device
- * @cdev:      the thermal cooling device to remove.
- *
- * thermal_cooling_device_unregister() must be called when the device is no
- * longer needed.
- */
-void thermal_cooling_device_unregister(struct thermal_cooling_device *cdev)
-{
-       int i;
-       const struct thermal_zone_params *tzp;
-       struct thermal_zone_device *tz;
-       struct thermal_cooling_device *pos = NULL;
-
-       if (!cdev)
-               return;
-
-       mutex_lock(&thermal_list_lock);
-       list_for_each_entry(pos, &thermal_cdev_list, node)
-           if (pos == cdev)
-               break;
-       if (pos != cdev) {
-               /* thermal cooling device not found */
-               mutex_unlock(&thermal_list_lock);
-               return;
-       }
-       list_del(&cdev->node);
-
-       /* Unbind all thermal zones associated with 'this' cdev */
-       list_for_each_entry(tz, &thermal_tz_list, node) {
-               if (tz->ops->unbind) {
-                       tz->ops->unbind(tz, cdev);
-                       continue;
-               }
-
-               if (!tz->tzp || !tz->tzp->tbp)
-                       continue;
-
-               tzp = tz->tzp;
-               for (i = 0; i < tzp->num_tbps; i++) {
-                       if (tzp->tbp[i].cdev == cdev) {
-                               __unbind(tz, tzp->tbp[i].trip_mask, cdev);
-                               tzp->tbp[i].cdev = NULL;
-                       }
-               }
-       }
-
-       mutex_unlock(&thermal_list_lock);
-
-       if (cdev->type[0])
-               device_remove_file(&cdev->device, &dev_attr_cdev_type);
-       device_remove_file(&cdev->device, &dev_attr_max_state);
-       device_remove_file(&cdev->device, &dev_attr_cur_state);
-
-       release_idr(&thermal_cdev_idr, &thermal_idr_lock, cdev->id);
-       device_unregister(&cdev->device);
-       return;
-}
-EXPORT_SYMBOL(thermal_cooling_device_unregister);
-
-void thermal_cdev_update(struct thermal_cooling_device *cdev)
-{
-       struct thermal_instance *instance;
-       unsigned long target = 0;
-
-       /* cooling device is updated*/
-       if (cdev->updated)
-               return;
-
-       mutex_lock(&cdev->lock);
-       /* Make sure cdev enters the deepest cooling state */
-       list_for_each_entry(instance, &cdev->thermal_instances, cdev_node) {
-               if (instance->target == THERMAL_NO_TARGET)
-                       continue;
-               if (instance->target > target)
-                       target = instance->target;
-       }
-       mutex_unlock(&cdev->lock);
-       cdev->ops->set_cur_state(cdev, target);
-       cdev->updated = true;
-}
-EXPORT_SYMBOL(thermal_cdev_update);
-
-/**
- * notify_thermal_framework - Sensor drivers use this API to notify framework
- * @tz:                thermal zone device
- * @trip:      indicates which trip point has been crossed
- *
- * This function handles the trip events from sensor drivers. It starts
- * throttling the cooling devices according to the policy configured.
- * For CRITICAL and HOT trip points, this notifies the respective drivers,
- * and does actual throttling for other trip points i.e ACTIVE and PASSIVE.
- * The throttling policy is based on the configured platform data; if no
- * platform data is provided, this uses the step_wise throttling policy.
- */
-void notify_thermal_framework(struct thermal_zone_device *tz, int trip)
-{
-       handle_thermal_trip(tz, trip);
-}
-EXPORT_SYMBOL(notify_thermal_framework);
-
-/**
- * create_trip_attrs - create attributes for trip points
- * @tz:                the thermal zone device
- * @mask:      Writeable trip point bitmap.
- */
-static int create_trip_attrs(struct thermal_zone_device *tz, int mask)
-{
-       int indx;
-       int size = sizeof(struct thermal_attr) * tz->trips;
-
-       tz->trip_type_attrs = kzalloc(size, GFP_KERNEL);
-       if (!tz->trip_type_attrs)
-               return -ENOMEM;
-
-       tz->trip_temp_attrs = kzalloc(size, GFP_KERNEL);
-       if (!tz->trip_temp_attrs) {
-               kfree(tz->trip_type_attrs);
-               return -ENOMEM;
-       }
-
-       if (tz->ops->get_trip_hyst) {
-               tz->trip_hyst_attrs = kzalloc(size, GFP_KERNEL);
-               if (!tz->trip_hyst_attrs) {
-                       kfree(tz->trip_type_attrs);
-                       kfree(tz->trip_temp_attrs);
-                       return -ENOMEM;
-               }
-       }
-
-
-       for (indx = 0; indx < tz->trips; indx++) {
-               /* create trip type attribute */
-               snprintf(tz->trip_type_attrs[indx].name, THERMAL_NAME_LENGTH,
-                        "trip_point_%d_type", indx);
-
-               sysfs_attr_init(&tz->trip_type_attrs[indx].attr.attr);
-               tz->trip_type_attrs[indx].attr.attr.name =
-                                               tz->trip_type_attrs[indx].name;
-               tz->trip_type_attrs[indx].attr.attr.mode = S_IRUGO;
-               tz->trip_type_attrs[indx].attr.show = trip_point_type_show;
-
-               device_create_file(&tz->device,
-                                  &tz->trip_type_attrs[indx].attr);
-
-               /* create trip temp attribute */
-               snprintf(tz->trip_temp_attrs[indx].name, THERMAL_NAME_LENGTH,
-                        "trip_point_%d_temp", indx);
-
-               sysfs_attr_init(&tz->trip_temp_attrs[indx].attr.attr);
-               tz->trip_temp_attrs[indx].attr.attr.name =
-                                               tz->trip_temp_attrs[indx].name;
-               tz->trip_temp_attrs[indx].attr.attr.mode = S_IRUGO;
-               tz->trip_temp_attrs[indx].attr.show = trip_point_temp_show;
-               if (mask & (1 << indx)) {
-                       tz->trip_temp_attrs[indx].attr.attr.mode |= S_IWUSR;
-                       tz->trip_temp_attrs[indx].attr.store =
-                                                       trip_point_temp_store;
-               }
-
-               device_create_file(&tz->device,
-                                  &tz->trip_temp_attrs[indx].attr);
-
-               /* create Optional trip hyst attribute */
-               if (!tz->ops->get_trip_hyst)
-                       continue;
-               snprintf(tz->trip_hyst_attrs[indx].name, THERMAL_NAME_LENGTH,
-                        "trip_point_%d_hyst", indx);
-
-               sysfs_attr_init(&tz->trip_hyst_attrs[indx].attr.attr);
-               tz->trip_hyst_attrs[indx].attr.attr.name =
-                                       tz->trip_hyst_attrs[indx].name;
-               tz->trip_hyst_attrs[indx].attr.attr.mode = S_IRUGO;
-               tz->trip_hyst_attrs[indx].attr.show = trip_point_hyst_show;
-               if (tz->ops->set_trip_hyst) {
-                       tz->trip_hyst_attrs[indx].attr.attr.mode |= S_IWUSR;
-                       tz->trip_hyst_attrs[indx].attr.store =
-                                       trip_point_hyst_store;
-               }
-
-               device_create_file(&tz->device,
-                                  &tz->trip_hyst_attrs[indx].attr);
-       }
-       return 0;
-}
-
-static void remove_trip_attrs(struct thermal_zone_device *tz)
-{
-       int indx;
-
-       for (indx = 0; indx < tz->trips; indx++) {
-               device_remove_file(&tz->device,
-                                  &tz->trip_type_attrs[indx].attr);
-               device_remove_file(&tz->device,
-                                  &tz->trip_temp_attrs[indx].attr);
-               if (tz->ops->get_trip_hyst)
-                       device_remove_file(&tz->device,
-                                 &tz->trip_hyst_attrs[indx].attr);
-       }
-       kfree(tz->trip_type_attrs);
-       kfree(tz->trip_temp_attrs);
-       kfree(tz->trip_hyst_attrs);
-}
-
-/**
- * thermal_zone_device_register - register a new thermal zone device
- * @type:      the thermal zone device type
- * @trips:     the number of trip points the thermal zone support
- * @mask:      a bit string indicating the writeablility of trip points
- * @devdata:   private device data
- * @ops:       standard thermal zone device callbacks
- * @tzp:       thermal zone platform parameters
- * @passive_delay: number of milliseconds to wait between polls when
- *                performing passive cooling
- * @polling_delay: number of milliseconds to wait between polls when checking
- *                whether trip points have been crossed (0 for interrupt
- *                driven systems)
- *
- * thermal_zone_device_unregister() must be called when the device is no
- * longer needed. The passive cooling depends on the .get_trend() return value.
- */
-struct thermal_zone_device *thermal_zone_device_register(const char *type,
-       int trips, int mask, void *devdata,
-       const struct thermal_zone_device_ops *ops,
-       const struct thermal_zone_params *tzp,
-       int passive_delay, int polling_delay)
-{
-       struct thermal_zone_device *tz;
-       enum thermal_trip_type trip_type;
-       int result;
-       int count;
-       int passive = 0;
-
-       if (type && strlen(type) >= THERMAL_NAME_LENGTH)
-               return ERR_PTR(-EINVAL);
-
-       if (trips > THERMAL_MAX_TRIPS || trips < 0 || mask >> trips)
-               return ERR_PTR(-EINVAL);
-
-       if (!ops || !ops->get_temp)
-               return ERR_PTR(-EINVAL);
-
-       if (trips > 0 && !ops->get_trip_type)
-               return ERR_PTR(-EINVAL);
-
-       tz = kzalloc(sizeof(struct thermal_zone_device), GFP_KERNEL);
-       if (!tz)
-               return ERR_PTR(-ENOMEM);
-
-       INIT_LIST_HEAD(&tz->thermal_instances);
-       idr_init(&tz->idr);
-       mutex_init(&tz->lock);
-       result = get_idr(&thermal_tz_idr, &thermal_idr_lock, &tz->id);
-       if (result) {
-               kfree(tz);
-               return ERR_PTR(result);
-       }
-
-       strcpy(tz->type, type ? : "");
-       tz->ops = ops;
-       tz->tzp = tzp;
-       tz->device.class = &thermal_class;
-       tz->devdata = devdata;
-       tz->trips = trips;
-       tz->passive_delay = passive_delay;
-       tz->polling_delay = polling_delay;
-
-       dev_set_name(&tz->device, "thermal_zone%d", tz->id);
-       result = device_register(&tz->device);
-       if (result) {
-               release_idr(&thermal_tz_idr, &thermal_idr_lock, tz->id);
-               kfree(tz);
-               return ERR_PTR(result);
-       }
-
-       /* sys I/F */
-       if (type) {
-               result = device_create_file(&tz->device, &dev_attr_type);
-               if (result)
-                       goto unregister;
-       }
-
-       result = device_create_file(&tz->device, &dev_attr_temp);
-       if (result)
-               goto unregister;
-
-       if (ops->get_mode) {
-               result = device_create_file(&tz->device, &dev_attr_mode);
-               if (result)
-                       goto unregister;
-       }
-
-       result = create_trip_attrs(tz, mask);
-       if (result)
-               goto unregister;
-
-       for (count = 0; count < trips; count++) {
-               tz->ops->get_trip_type(tz, count, &trip_type);
-               if (trip_type == THERMAL_TRIP_PASSIVE)
-                       passive = 1;
-       }
-
-       if (!passive) {
-               result = device_create_file(&tz->device, &dev_attr_passive);
-               if (result)
-                       goto unregister;
-       }
-
-#ifdef CONFIG_THERMAL_EMULATION
-       result = device_create_file(&tz->device, &dev_attr_emul_temp);
-       if (result)
-               goto unregister;
-#endif
-       /* Create policy attribute */
-       result = device_create_file(&tz->device, &dev_attr_policy);
-       if (result)
-               goto unregister;
-
-       /* Update 'this' zone's governor information */
-       mutex_lock(&thermal_governor_lock);
-
-       if (tz->tzp)
-               tz->governor = __find_governor(tz->tzp->governor_name);
-       else
-               tz->governor = __find_governor(DEFAULT_THERMAL_GOVERNOR);
-
-       mutex_unlock(&thermal_governor_lock);
-
-       result = thermal_add_hwmon_sysfs(tz);
-       if (result)
-               goto unregister;
-
-       mutex_lock(&thermal_list_lock);
-       list_add_tail(&tz->node, &thermal_tz_list);
-       mutex_unlock(&thermal_list_lock);
-
-       /* Bind cooling devices for this zone */
-       bind_tz(tz);
-
-       INIT_DELAYED_WORK(&(tz->poll_queue), thermal_zone_device_check);
-
-       thermal_zone_device_update(tz);
-
-       if (!result)
-               return tz;
-
-unregister:
-       release_idr(&thermal_tz_idr, &thermal_idr_lock, tz->id);
-       device_unregister(&tz->device);
-       return ERR_PTR(result);
-}
-EXPORT_SYMBOL(thermal_zone_device_register);
-
-/**
- * thermal_device_unregister - removes the registered thermal zone device
- * @tz: the thermal zone device to remove
- */
-void thermal_zone_device_unregister(struct thermal_zone_device *tz)
-{
-       int i;
-       const struct thermal_zone_params *tzp;
-       struct thermal_cooling_device *cdev;
-       struct thermal_zone_device *pos = NULL;
-
-       if (!tz)
-               return;
-
-       tzp = tz->tzp;
-
-       mutex_lock(&thermal_list_lock);
-       list_for_each_entry(pos, &thermal_tz_list, node)
-           if (pos == tz)
-               break;
-       if (pos != tz) {
-               /* thermal zone device not found */
-               mutex_unlock(&thermal_list_lock);
-               return;
-       }
-       list_del(&tz->node);
-
-       /* Unbind all cdevs associated with 'this' thermal zone */
-       list_for_each_entry(cdev, &thermal_cdev_list, node) {
-               if (tz->ops->unbind) {
-                       tz->ops->unbind(tz, cdev);
-                       continue;
-               }
-
-               if (!tzp || !tzp->tbp)
-                       break;
-
-               for (i = 0; i < tzp->num_tbps; i++) {
-                       if (tzp->tbp[i].cdev == cdev) {
-                               __unbind(tz, tzp->tbp[i].trip_mask, cdev);
-                               tzp->tbp[i].cdev = NULL;
-                       }
-               }
-       }
-
-       mutex_unlock(&thermal_list_lock);
-
-       thermal_zone_device_set_polling(tz, 0);
-
-       if (tz->type[0])
-               device_remove_file(&tz->device, &dev_attr_type);
-       device_remove_file(&tz->device, &dev_attr_temp);
-       if (tz->ops->get_mode)
-               device_remove_file(&tz->device, &dev_attr_mode);
-       device_remove_file(&tz->device, &dev_attr_policy);
-       remove_trip_attrs(tz);
-       tz->governor = NULL;
-
-       thermal_remove_hwmon_sysfs(tz);
-       release_idr(&thermal_tz_idr, &thermal_idr_lock, tz->id);
-       idr_destroy(&tz->idr);
-       mutex_destroy(&tz->lock);
-       device_unregister(&tz->device);
-       return;
-}
-EXPORT_SYMBOL(thermal_zone_device_unregister);
-
-#ifdef CONFIG_NET
-static struct genl_family thermal_event_genl_family = {
-       .id = GENL_ID_GENERATE,
-       .name = THERMAL_GENL_FAMILY_NAME,
-       .version = THERMAL_GENL_VERSION,
-       .maxattr = THERMAL_GENL_ATTR_MAX,
-};
-
-static struct genl_multicast_group thermal_event_mcgrp = {
-       .name = THERMAL_GENL_MCAST_GROUP_NAME,
-};
-
-int thermal_generate_netlink_event(struct thermal_zone_device *tz,
-                                       enum events event)
-{
-       struct sk_buff *skb;
-       struct nlattr *attr;
-       struct thermal_genl_event *thermal_event;
-       void *msg_header;
-       int size;
-       int result;
-       static unsigned int thermal_event_seqnum;
-
-       if (!tz)
-               return -EINVAL;
-
-       /* allocate memory */
-       size = nla_total_size(sizeof(struct thermal_genl_event)) +
-              nla_total_size(0);
-
-       skb = genlmsg_new(size, GFP_ATOMIC);
-       if (!skb)
-               return -ENOMEM;
-
-       /* add the genetlink message header */
-       msg_header = genlmsg_put(skb, 0, thermal_event_seqnum++,
-                                &thermal_event_genl_family, 0,
-                                THERMAL_GENL_CMD_EVENT);
-       if (!msg_header) {
-               nlmsg_free(skb);
-               return -ENOMEM;
-       }
-
-       /* fill the data */
-       attr = nla_reserve(skb, THERMAL_GENL_ATTR_EVENT,
-                          sizeof(struct thermal_genl_event));
-
-       if (!attr) {
-               nlmsg_free(skb);
-               return -EINVAL;
-       }
-
-       thermal_event = nla_data(attr);
-       if (!thermal_event) {
-               nlmsg_free(skb);
-               return -EINVAL;
-       }
-
-       memset(thermal_event, 0, sizeof(struct thermal_genl_event));
-
-       thermal_event->orig = tz->id;
-       thermal_event->event = event;
-
-       /* send multicast genetlink message */
-       result = genlmsg_end(skb, msg_header);
-       if (result < 0) {
-               nlmsg_free(skb);
-               return result;
-       }
-
-       result = genlmsg_multicast(skb, 0, thermal_event_mcgrp.id, GFP_ATOMIC);
-       if (result)
-               dev_err(&tz->device, "Failed to send netlink event:%d", result);
-
-       return result;
-}
-EXPORT_SYMBOL(thermal_generate_netlink_event);
-
-static int genetlink_init(void)
-{
-       int result;
-
-       result = genl_register_family(&thermal_event_genl_family);
-       if (result)
-               return result;
-
-       result = genl_register_mc_group(&thermal_event_genl_family,
-                                       &thermal_event_mcgrp);
-       if (result)
-               genl_unregister_family(&thermal_event_genl_family);
-       return result;
-}
-
-static void genetlink_exit(void)
-{
-       genl_unregister_family(&thermal_event_genl_family);
-}
-#else /* !CONFIG_NET */
-static inline int genetlink_init(void) { return 0; }
-static inline void genetlink_exit(void) {}
-#endif /* !CONFIG_NET */
-
-static int __init thermal_init(void)
-{
-       int result = 0;
-
-       result = class_register(&thermal_class);
-       if (result) {
-               idr_destroy(&thermal_tz_idr);
-               idr_destroy(&thermal_cdev_idr);
-               mutex_destroy(&thermal_idr_lock);
-               mutex_destroy(&thermal_list_lock);
-               return result;
-       }
-       result = genetlink_init();
-       return result;
-}
-
-static void __exit thermal_exit(void)
-{
-       class_unregister(&thermal_class);
-       idr_destroy(&thermal_tz_idr);
-       idr_destroy(&thermal_cdev_idr);
-       mutex_destroy(&thermal_idr_lock);
-       mutex_destroy(&thermal_list_lock);
-       genetlink_exit();
-}
-
-fs_initcall(thermal_init);
-module_exit(thermal_exit);
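To keep the API being moved out of this file readable, here is a minimal, hypothetical sketch (not part of this patch) of the cooling-device side it describes: an ops table with the three mandatory callbacks checked above, plus a zone .bind callback using thermal_zone_bind_cooling_device(). All my_fan_*/my_zone_* names, the four-speed fan and the single trip point are invented for illustration.

    #include <linux/thermal.h>

    static unsigned long my_fan_speed;          /* assumed driver state */

    static int my_fan_get_max_state(struct thermal_cooling_device *cdev,
                                    unsigned long *state)
    {
            *state = 3;                         /* assume four speeds, 0..3 */
            return 0;
    }

    static int my_fan_get_cur_state(struct thermal_cooling_device *cdev,
                                    unsigned long *state)
    {
            *state = my_fan_speed;
            return 0;
    }

    static int my_fan_set_cur_state(struct thermal_cooling_device *cdev,
                                    unsigned long state)
    {
            my_fan_speed = state;               /* a real driver would program hardware */
            return 0;
    }

    static const struct thermal_cooling_device_ops my_fan_ops = {
            .get_max_state  = my_fan_get_max_state,
            .get_cur_state  = my_fan_get_cur_state,
            .set_cur_state  = my_fan_set_cur_state,
    };

    /* Registration, typically from the driver's probe():
     *      cdev = thermal_cooling_device_register("my-fan", NULL, &my_fan_ops);
     *
     * and a zone .bind callback attaching it to trip point 0 with the
     * default state range (THERMAL_NO_LIMIT expands to 0..max_state):
     */
    static int my_zone_bind(struct thermal_zone_device *tz,
                            struct thermal_cooling_device *cdev)
    {
            return thermal_zone_bind_cooling_device(tz, 0, cdev,
                                                    THERMAL_NO_LIMIT,
                                                    THERMAL_NO_LIMIT);
    }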
index 6bbb380b6d19ecf0c6e1e245dceee040c08e2a3b..10adcddc88211e9589f27b1ab6c79eaa75cb7fe7 100644 (file)
@@ -22,9 +22,6 @@
  * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  */
 
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/module.h>
 #include <linux/thermal.h>
 
 #include "thermal_core.h"
@@ -46,23 +43,15 @@ static int notify_user_space(struct thermal_zone_device *tz, int trip)
 static struct thermal_governor thermal_gov_user_space = {
        .name           = "user_space",
        .throttle       = notify_user_space,
-       .owner          = THIS_MODULE,
 };
 
-static int __init thermal_gov_user_space_init(void)
+int thermal_gov_user_space_register(void)
 {
        return thermal_register_governor(&thermal_gov_user_space);
 }
 
-static void __exit thermal_gov_user_space_exit(void)
+void thermal_gov_user_space_unregister(void)
 {
        thermal_unregister_governor(&thermal_gov_user_space);
 }
 
-/* This should load after thermal framework */
-fs_initcall(thermal_gov_user_space_init);
-module_exit(thermal_gov_user_space_exit);
-
-MODULE_AUTHOR("Durgadoss R");
-MODULE_DESCRIPTION("A user space Thermal notifier");
-MODULE_LICENSE("GPL");
index dda0dc4a55674756c76cb0efdf0dd6973d1b6109..570c005062ab8a3c75a9da56131ffebfdce118aa 100644 (file)
@@ -24,6 +24,8 @@
 #include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/poll.h>
+#include <linux/mmu_context.h>
+#include <linux/aio.h>
 
 #include <linux/device.h>
 #include <linux/moduleparam.h>
@@ -513,6 +515,9 @@ static long ep_ioctl(struct file *fd, unsigned code, unsigned long value)
 struct kiocb_priv {
        struct usb_request      *req;
        struct ep_data          *epdata;
+       struct kiocb            *iocb;
+       struct mm_struct        *mm;
+       struct work_struct      work;
        void                    *buf;
        const struct iovec      *iv;
        unsigned long           nr_segs;
@@ -528,7 +533,6 @@ static int ep_aio_cancel(struct kiocb *iocb, struct io_event *e)
        local_irq_disable();
        epdata = priv->epdata;
        // spin_lock(&epdata->dev->lock);
-       kiocbSetCancelled(iocb);
        if (likely(epdata && epdata->ep && priv->req))
                value = usb_ep_dequeue (epdata->ep, priv->req);
        else
@@ -540,15 +544,12 @@ static int ep_aio_cancel(struct kiocb *iocb, struct io_event *e)
        return value;
 }
 
-static ssize_t ep_aio_read_retry(struct kiocb *iocb)
+static ssize_t ep_copy_to_user(struct kiocb_priv *priv)
 {
-       struct kiocb_priv       *priv = iocb->private;
        ssize_t                 len, total;
        void                    *to_copy;
        int                     i;
 
-       /* we "retry" to get the right mm context for this: */
-
        /* copy stuff into user buffers */
        total = priv->actual;
        len = 0;
@@ -568,9 +569,26 @@ static ssize_t ep_aio_read_retry(struct kiocb *iocb)
                if (total == 0)
                        break;
        }
+
+       return len;
+}
+
+static void ep_user_copy_worker(struct work_struct *work)
+{
+       struct kiocb_priv *priv = container_of(work, struct kiocb_priv, work);
+       struct mm_struct *mm = priv->mm;
+       struct kiocb *iocb = priv->iocb;
+       size_t ret;
+
+       use_mm(mm);
+       ret = ep_copy_to_user(priv);
+       unuse_mm(mm);
+
+       /* completing the iocb can drop the ctx and mm, don't touch mm after */
+       aio_complete(iocb, ret, ret);
+
        kfree(priv->buf);
        kfree(priv);
-       return len;
 }
 
 static void ep_aio_complete(struct usb_ep *ep, struct usb_request *req)
@@ -596,14 +614,14 @@ static void ep_aio_complete(struct usb_ep *ep, struct usb_request *req)
                aio_complete(iocb, req->actual ? req->actual : req->status,
                                req->status);
        } else {
-               /* retry() won't report both; so we hide some faults */
+               /* ep_copy_to_user() won't report both; we hide some faults */
                if (unlikely(0 != req->status))
                        DBG(epdata->dev, "%s fault %d len %d\n",
                                ep->name, req->status, req->actual);
 
                priv->buf = req->buf;
                priv->actual = req->actual;
-               kick_iocb(iocb);
+               schedule_work(&priv->work);
        }
        spin_unlock(&epdata->dev->lock);
 
@@ -633,8 +651,10 @@ fail:
                return value;
        }
        iocb->private = priv;
+       priv->iocb = iocb;
        priv->iv = iv;
        priv->nr_segs = nr_segs;
+       INIT_WORK(&priv->work, ep_user_copy_worker);
 
        value = get_ready_ep(iocb->ki_filp->f_flags, epdata);
        if (unlikely(value < 0)) {
@@ -642,10 +662,11 @@ fail:
                goto fail;
        }
 
-       iocb->ki_cancel = ep_aio_cancel;
+       kiocb_set_cancel_fn(iocb, ep_aio_cancel);
        get_ep(epdata);
        priv->epdata = epdata;
        priv->actual = 0;
+       priv->mm = current->mm; /* mm teardown waits for iocbs in exit_aio() */
 
        /* each kiocb is coupled to one usb_request, but we can't
         * allocate or submit those if the host disconnected.
@@ -674,7 +695,7 @@ fail:
                kfree(priv);
                put_ep(epdata);
        } else
-               value = (iv ? -EIOCBRETRY : -EIOCBQUEUED);
+               value = -EIOCBQUEUED;
        return value;
 }
 
@@ -692,7 +713,6 @@ ep_aio_read(struct kiocb *iocb, const struct iovec *iov,
        if (unlikely(!buf))
                return -ENOMEM;
 
-       iocb->ki_retry = ep_aio_read_retry;
        return ep_aio_rwtail(iocb, buf, iocb->ki_left, epdata, iov, nr_segs);
 }
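The gadgetfs conversion above drops the old -EIOCBRETRY/retry machinery in favour of a workqueue item that temporarily adopts the submitter's mm before copying to user space and completing the iocb. Stripped of the driver specifics, the pattern is roughly this (hypothetical names, error handling and the actual copy omitted):

    #include <linux/aio.h>
    #include <linux/mmu_context.h>
    #include <linux/sched.h>
    #include <linux/slab.h>
    #include <linux/workqueue.h>

    struct deferred_copy {
            struct work_struct      work;
            struct mm_struct        *mm;    /* saved as current->mm at submit time */
            struct kiocb            *iocb;
            /* ... buffer and iovec bookkeeping ... */
    };

    static void deferred_copy_worker(struct work_struct *work)
    {
            struct deferred_copy *dc =
                    container_of(work, struct deferred_copy, work);

            use_mm(dc->mm);                 /* borrow the submitter's address space */
            /* copy_to_user() into the original iovec goes here */
            unuse_mm(dc->mm);

            /* completing the iocb can drop the ctx and mm; don't touch them after */
            aio_complete(dc->iocb, 0, 0);
            kfree(dc);
    }

    /*
     * At submission: dc->mm = current->mm and INIT_WORK(&dc->work,
     * deferred_copy_worker); from the (atomic) completion callback:
     * schedule_work(&dc->work).
     */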
 
index aab2ab2fbc90d050a474cb39df7563bbfedb8633..371d0e74e9094132eb378366858dedd7e654d89f 100644 (file)
@@ -128,7 +128,7 @@ config TWL6030_USB
 
 config USB_GPIO_VBUS
        tristate "GPIO based peripheral-only VBUS sensing 'transceiver'"
-       depends on GENERIC_GPIO
+       depends on GPIOLIB
        help
          Provides simple GPIO VBUS sensing for controllers with an
          internal transceiver via the usb_phy interface, and
index c04ccdf60eaa21ddd4d441b07ad6b3201ce62614..d71d60f94fc19a7fcc34ace0940dfb841e9904fc 100644 (file)
@@ -2429,7 +2429,7 @@ config FB_MXS
        select FB_CFB_COPYAREA
        select FB_CFB_IMAGEBLIT
        select FB_MODE_HELPERS
-       select OF_VIDEOMODE
+       select VIDEOMODE_HELPERS
        help
          Framebuffer support for the MXS SoC.
 
@@ -2483,7 +2483,7 @@ config FB_SSD1307
        tristate "Solomon SSD1307 framebuffer support"
        depends on FB && I2C
        depends on OF
-       depends on GENERIC_GPIO
+       depends on GPIOLIB
        select FB_SYS_FOPS
        select FB_SYS_FILLRECT
        select FB_SYS_COPYAREA
index 2e166c3fc4c354c1184343b265d17ef02f6c212f..d5ab6583f440492fe1c3caa77001e32bcfcf0efa 100644 (file)
@@ -36,14 +36,14 @@ config LCD_CORGI
 
 config LCD_L4F00242T03
        tristate "Epson L4F00242T03 LCD"
-       depends on SPI_MASTER && GENERIC_GPIO
+       depends on SPI_MASTER && GPIOLIB
        help
          SPI driver for Epson L4F00242T03. This provides basic support
          for init and powering the LCD up/down through a sysfs interface.
 
 config LCD_LMS283GF05
        tristate "Samsung LMS283GF05 LCD"
-       depends on SPI_MASTER && GENERIC_GPIO
+       depends on SPI_MASTER && GPIOLIB
        help
          SPI driver for Samsung LMS283GF05. This provides basic support
          for powering the LCD up/down through a sysfs interface.
index 1b2c26d1658c65ed2fd73bb8a43462f537fe7f4a..21223d475b39bb54aff2f58e95a73282f37754dd 100644 (file)
@@ -42,7 +42,6 @@
 #include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/of_device.h>
-#include <video/of_display_timing.h>
 #include <linux/platform_device.h>
 #include <linux/clk.h>
 #include <linux/dma-mapping.h>
@@ -50,6 +49,7 @@
 #include <linux/pinctrl/consumer.h>
 #include <linux/fb.h>
 #include <linux/regulator/consumer.h>
+#include <video/of_display_timing.h>
 #include <video/videomode.h>
 
 #define REG_SET        4
@@ -777,16 +777,16 @@ static int mxsfb_init_fbinfo_dt(struct mxsfb_info *host)
                struct videomode vm;
                struct fb_videomode fb_vm;
 
-               ret = videomode_from_timing(timings, &vm, i);
+               ret = videomode_from_timings(timings, &vm, i);
                if (ret < 0)
                        goto put_timings_node;
                ret = fb_videomode_from_videomode(&vm, &fb_vm);
                if (ret < 0)
                        goto put_timings_node;
 
-               if (vm.data_flags & DISPLAY_FLAGS_DE_HIGH)
+               if (vm.flags & DISPLAY_FLAGS_DE_HIGH)
                        host->sync |= MXSFB_SYNC_DATA_ENABLE_HIGH_ACT;
-               if (vm.data_flags & DISPLAY_FLAGS_PIXDATA_NEGEDGE)
+               if (vm.flags & DISPLAY_FLAGS_PIXDATA_NEGEDGE)
                        host->sync |= MXSFB_SYNC_DOTCLK_FALLING_ACT;
                fb_add_videomode(&fb_vm, &fb_info->modelist);
        }
index e8ca63a82b9777d6ae66cdcb34be199bd5fc7d7c..2bd1257dcc1cb0a1809f544a884a3e1a3dbe9128 100644 (file)
@@ -50,7 +50,7 @@ config W1_MASTER_DS1WM
 
 config W1_MASTER_GPIO
        tristate "GPIO 1-wire busmaster"
-       depends on GENERIC_GPIO
+       depends on GPIOLIB
        help
          Say Y here if you want to communicate with your 1-wire devices using
          GPIO pins. This driver uses the GPIO API to control the wire.
index 898799074a13b87bf35113dbb6b1f9550de352b0..d184c48a0482b6a72fab3db591b5a8c425b531ab 100644 (file)
@@ -253,11 +253,9 @@ static int ath79_wdt_probe(struct platform_device *pdev)
                return -EINVAL;
        }
 
-       wdt_base = devm_request_and_ioremap(&pdev->dev, res);
-       if (!wdt_base) {
-               dev_err(&pdev->dev, "unable to remap memory region\n");
-               return -ENOMEM;
-       }
+       wdt_base = devm_ioremap_resource(&pdev->dev, res);
+       if (IS_ERR(wdt_base))
+               return PTR_ERR(wdt_base);
 
        wdt_clk = devm_clk_get(&pdev->dev, "wdt");
        if (IS_ERR(wdt_clk))
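The same devm_request_and_ioremap() -> devm_ioremap_resource() conversion repeats in the davinci, s3c2410 and sh watchdog hunks that follow; the resulting probe idiom, sketched here with a hypothetical driver name, is simply:

    #include <linux/err.h>
    #include <linux/io.h>
    #include <linux/platform_device.h>

    static int example_wdt_probe(struct platform_device *pdev)
    {
            struct resource *res;
            void __iomem *base;

            res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
            base = devm_ioremap_resource(&pdev->dev, res);
            if (IS_ERR(base))
                    return PTR_ERR(base);   /* devm_ioremap_resource() already printed why */

            /* ... rest of probe ... */
            return 0;
    }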
index 7df1fdca9e7811672f74aa57be7f85d901662ad4..100d4fbfde2adf594203cd5a874dd72f20e06b03 100644 (file)
@@ -27,6 +27,7 @@
 #include <linux/device.h>
 #include <linux/clk.h>
 #include <linux/slab.h>
+#include <linux/err.h>
 
 #define MODULE_NAME "DAVINCI-WDT: "
 
@@ -221,11 +222,9 @@ static int davinci_wdt_probe(struct platform_device *pdev)
                return -ENOENT;
        }
 
-       wdt_base = devm_request_and_ioremap(dev, wdt_mem);
-       if (!wdt_base) {
-               dev_err(dev, "ioremap failed\n");
-               return -EADDRNOTAVAIL;
-       }
+       wdt_base = devm_ioremap_resource(dev, wdt_mem);
+       if (IS_ERR(wdt_base))
+               return PTR_ERR(wdt_base);
 
        ret = misc_register(&davinci_wdt_miscdev);
        if (ret < 0) {
index c1a221cbeae4eec83a94aeb730b9c5e6d9a540ac..ee03135f5abd009bda6f4183d6debf64185131bf 100644 (file)
@@ -330,10 +330,9 @@ static int s3c2410wdt_probe(struct platform_device *pdev)
        }
 
        /* get the memory region for the watchdog timer */
-       wdt_base = devm_request_and_ioremap(dev, wdt_mem);
-       if (wdt_base == NULL) {
-               dev_err(dev, "failed to devm_request_and_ioremap() region\n");
-               ret = -ENOMEM;
+       wdt_base = devm_ioremap_resource(dev, wdt_mem);
+       if (IS_ERR(wdt_base)) {
+               ret = PTR_ERR(wdt_base);
                goto err;
        }
 
index 6a89e4045fbd2bdef12e251bfba3044c469f0401..6185af2b33109c5ce8c1ef71a93b35eb2542bfdd 100644 (file)
@@ -34,6 +34,7 @@
 #include <linux/slab.h>
 #include <linux/io.h>
 #include <linux/clk.h>
+#include <linux/err.h>
 #include <asm/watchdog.h>
 
 #define DRV_NAME "sh-wdt"
@@ -249,9 +250,9 @@ static int sh_wdt_probe(struct platform_device *pdev)
                wdt->clk = NULL;
        }
 
-       wdt->base = devm_request_and_ioremap(wdt->dev, res);
-       if (unlikely(!wdt->base)) {
-               rc = -EADDRNOTAVAIL;
+       wdt->base = devm_ioremap_resource(wdt->dev, res);
+       if (IS_ERR(wdt->base)) {
+               rc = PTR_ERR(wdt->base);
                goto err;
        }
 
index 08b48bbf9f4b2d3c65cf18045ff90c1465c9787f..faf4e189fe423179cb0261e77a3519a2a2b1c03e 100644 (file)
@@ -523,6 +523,7 @@ int watchdog_dev_register(struct watchdog_device *watchdog)
        int err, devno;
 
        if (watchdog->id == 0) {
+               old_wdd = watchdog;
                watchdog_miscdev.parent = watchdog->parent;
                err = misc_register(&watchdog_miscdev);
                if (err != 0) {
@@ -531,9 +532,9 @@ int watchdog_dev_register(struct watchdog_device *watchdog)
                        if (err == -EBUSY)
                                pr_err("%s: a legacy watchdog module is probably present.\n",
                                        watchdog->info->identity);
+                       old_wdd = NULL;
                        return err;
                }
-               old_wdd = watchdog;
        }
 
        /* Fill in the data structures */
index 0ad61c6a65a5b1964b373432f5e9bf9b031f6d38..055562c580b43e673615811f4bdc6e778180420b 100644 (file)
@@ -33,6 +33,7 @@
 #include <linux/pagemap.h>
 #include <linux/idr.h>
 #include <linux/sched.h>
+#include <linux/aio.h>
 #include <net/9p/9p.h>
 #include <net/9p/client.h>
 
index 7e03eadb40c0a71d1dfd3971f8136abae2bb7822..a890db4b9898fc1d888c5e7285da55db85e4da54 100644 (file)
@@ -14,6 +14,7 @@
 #include <linux/pagemap.h>
 #include <linux/writeback.h>
 #include <linux/pagevec.h>
+#include <linux/aio.h>
 #include "internal.h"
 
 static int afs_write_back_from_locked_page(struct afs_writeback *wb,
index 351afe7ac78ebbed1dc331716c5c2ba46c16faf2..c5b1a8c10411ab108960eb74ed20f1b1a4bed601 100644 (file)
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -8,6 +8,8 @@
  *
  *     See ../COPYING for licensing terms.
  */
+#define pr_fmt(fmt) "%s: " fmt, __func__
+
 #include <linux/kernel.h>
 #include <linux/init.h>
 #include <linux/errno.h>
@@ -18,8 +20,6 @@
 #include <linux/backing-dev.h>
 #include <linux/uio.h>
 
-#define DEBUG 0
-
 #include <linux/sched.h>
 #include <linux/fs.h>
 #include <linux/file.h>
 #include <asm/kmap_types.h>
 #include <asm/uaccess.h>
 
-#if DEBUG > 1
-#define dprintk                printk
-#else
-#define dprintk(x...)  do { ; } while (0)
-#endif
+#define AIO_RING_MAGIC                 0xa10a10a1
+#define AIO_RING_COMPAT_FEATURES       1
+#define AIO_RING_INCOMPAT_FEATURES     0
+struct aio_ring {
+       unsigned        id;     /* kernel internal index number */
+       unsigned        nr;     /* number of io_events */
+       unsigned        head;
+       unsigned        tail;
+
+       unsigned        magic;
+       unsigned        compat_features;
+       unsigned        incompat_features;
+       unsigned        header_length;  /* size of aio_ring */
+
+
+       struct io_event         io_events[0];
+}; /* 128 bytes + ring size */
+
+#define AIO_RING_PAGES 8
+
+struct kioctx {
+       atomic_t                users;
+       atomic_t                dead;
+
+       /* This needs improving */
+       unsigned long           user_id;
+       struct hlist_node       list;
+
+       /*
+        * This is what userspace passed to io_setup(), it's not used for
+        * anything but counting against the global max_reqs quota.
+        *
+        * The real limit is nr_events - 1, which will be larger (see
+        * aio_setup_ring())
+        */
+       unsigned                max_reqs;
+
+       /* Size of ringbuffer, in units of struct io_event */
+       unsigned                nr_events;
+
+       unsigned long           mmap_base;
+       unsigned long           mmap_size;
+
+       struct page             **ring_pages;
+       long                    nr_pages;
+
+       struct rcu_head         rcu_head;
+       struct work_struct      rcu_work;
+
+       struct {
+               atomic_t        reqs_active;
+       } ____cacheline_aligned_in_smp;
+
+       struct {
+               spinlock_t      ctx_lock;
+               struct list_head active_reqs;   /* used for cancellation */
+       } ____cacheline_aligned_in_smp;
+
+       struct {
+               struct mutex    ring_lock;
+               wait_queue_head_t wait;
+       } ____cacheline_aligned_in_smp;
+
+       struct {
+               unsigned        tail;
+               spinlock_t      completion_lock;
+       } ____cacheline_aligned_in_smp;
+
+       struct page             *internal_pages[AIO_RING_PAGES];
+};
 
 /*------ sysctl variables----*/
 static DEFINE_SPINLOCK(aio_nr_lock);
@@ -54,11 +119,6 @@ unsigned long aio_max_nr = 0x10000; /* system wide maximum number of aio request
 static struct kmem_cache       *kiocb_cachep;
 static struct kmem_cache       *kioctx_cachep;
 
-static struct workqueue_struct *aio_wq;
-
-static void aio_kick_handler(struct work_struct *);
-static void aio_queue_work(struct kioctx *);
-
 /* aio_setup
  *     Creates the slab caches used by the aio routines, panic on
  *     failure as this is done early during the boot sequence.
@@ -68,10 +128,7 @@ static int __init aio_setup(void)
        kiocb_cachep = KMEM_CACHE(kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC);
        kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC);
 
-       aio_wq = alloc_workqueue("aio", 0, 1);  /* used to limit concurrency */
-       BUG_ON(!aio_wq);
-
-       pr_debug("aio_setup: sizeof(struct page) = %d\n", (int)sizeof(struct page));
+       pr_debug("sizeof(struct page) = %zu\n", sizeof(struct page));
 
        return 0;
 }
@@ -79,28 +136,23 @@ __initcall(aio_setup);
 
 static void aio_free_ring(struct kioctx *ctx)
 {
-       struct aio_ring_info *info = &ctx->ring_info;
        long i;
 
-       for (i=0; i<info->nr_pages; i++)
-               put_page(info->ring_pages[i]);
+       for (i = 0; i < ctx->nr_pages; i++)
+               put_page(ctx->ring_pages[i]);
 
-       if (info->mmap_size) {
-               BUG_ON(ctx->mm != current->mm);
-               vm_munmap(info->mmap_base, info->mmap_size);
-       }
+       if (ctx->mmap_size)
+               vm_munmap(ctx->mmap_base, ctx->mmap_size);
 
-       if (info->ring_pages && info->ring_pages != info->internal_pages)
-               kfree(info->ring_pages);
-       info->ring_pages = NULL;
-       info->nr = 0;
+       if (ctx->ring_pages && ctx->ring_pages != ctx->internal_pages)
+               kfree(ctx->ring_pages);
 }
 
 static int aio_setup_ring(struct kioctx *ctx)
 {
        struct aio_ring *ring;
-       struct aio_ring_info *info = &ctx->ring_info;
        unsigned nr_events = ctx->max_reqs;
+       struct mm_struct *mm = current->mm;
        unsigned long size, populate;
        int nr_pages;
 
@@ -116,46 +168,44 @@ static int aio_setup_ring(struct kioctx *ctx)
 
        nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring)) / sizeof(struct io_event);
 
-       info->nr = 0;
-       info->ring_pages = info->internal_pages;
+       ctx->nr_events = 0;
+       ctx->ring_pages = ctx->internal_pages;
        if (nr_pages > AIO_RING_PAGES) {
-               info->ring_pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
-               if (!info->ring_pages)
+               ctx->ring_pages = kcalloc(nr_pages, sizeof(struct page *),
+                                         GFP_KERNEL);
+               if (!ctx->ring_pages)
                        return -ENOMEM;
        }
 
-       info->mmap_size = nr_pages * PAGE_SIZE;
-       dprintk("attempting mmap of %lu bytes\n", info->mmap_size);
-       down_write(&ctx->mm->mmap_sem);
-       info->mmap_base = do_mmap_pgoff(NULL, 0, info->mmap_size, 
-                                       PROT_READ|PROT_WRITE,
-                                       MAP_ANONYMOUS|MAP_PRIVATE, 0,
-                                       &populate);
-       if (IS_ERR((void *)info->mmap_base)) {
-               up_write(&ctx->mm->mmap_sem);
-               info->mmap_size = 0;
+       ctx->mmap_size = nr_pages * PAGE_SIZE;
+       pr_debug("attempting mmap of %lu bytes\n", ctx->mmap_size);
+       down_write(&mm->mmap_sem);
+       ctx->mmap_base = do_mmap_pgoff(NULL, 0, ctx->mmap_size,
+                                      PROT_READ|PROT_WRITE,
+                                      MAP_ANONYMOUS|MAP_PRIVATE, 0, &populate);
+       if (IS_ERR((void *)ctx->mmap_base)) {
+               up_write(&mm->mmap_sem);
+               ctx->mmap_size = 0;
                aio_free_ring(ctx);
                return -EAGAIN;
        }
 
-       dprintk("mmap address: 0x%08lx\n", info->mmap_base);
-       info->nr_pages = get_user_pages(current, ctx->mm,
-                                       info->mmap_base, nr_pages, 
-                                       1, 0, info->ring_pages, NULL);
-       up_write(&ctx->mm->mmap_sem);
+       pr_debug("mmap address: 0x%08lx\n", ctx->mmap_base);
+       ctx->nr_pages = get_user_pages(current, mm, ctx->mmap_base, nr_pages,
+                                      1, 0, ctx->ring_pages, NULL);
+       up_write(&mm->mmap_sem);
 
-       if (unlikely(info->nr_pages != nr_pages)) {
+       if (unlikely(ctx->nr_pages != nr_pages)) {
                aio_free_ring(ctx);
                return -EAGAIN;
        }
        if (populate)
-               mm_populate(info->mmap_base, populate);
+               mm_populate(ctx->mmap_base, populate);
 
-       ctx->user_id = info->mmap_base;
+       ctx->user_id = ctx->mmap_base;
+       ctx->nr_events = nr_events; /* trusted copy */
 
-       info->nr = nr_events;           /* trusted copy */
-
-       ring = kmap_atomic(info->ring_pages[0]);
+       ring = kmap_atomic(ctx->ring_pages[0]);
        ring->nr = nr_events;   /* user copy */
        ring->id = ctx->user_id;
        ring->head = ring->tail = 0;
@@ -164,72 +214,133 @@ static int aio_setup_ring(struct kioctx *ctx)
        ring->incompat_features = AIO_RING_INCOMPAT_FEATURES;
        ring->header_length = sizeof(struct aio_ring);
        kunmap_atomic(ring);
+       flush_dcache_page(ctx->ring_pages[0]);
 
        return 0;
 }
 
-
-/* aio_ring_event: returns a pointer to the event at the given index from
- * kmap_atomic().  Release the pointer with put_aio_ring_event();
- */
 #define AIO_EVENTS_PER_PAGE    (PAGE_SIZE / sizeof(struct io_event))
 #define AIO_EVENTS_FIRST_PAGE  ((PAGE_SIZE - sizeof(struct aio_ring)) / sizeof(struct io_event))
 #define AIO_EVENTS_OFFSET      (AIO_EVENTS_PER_PAGE - AIO_EVENTS_FIRST_PAGE)
 
-#define aio_ring_event(info, nr) ({                                    \
-       unsigned pos = (nr) + AIO_EVENTS_OFFSET;                        \
-       struct io_event *__event;                                       \
-       __event = kmap_atomic(                                          \
-                       (info)->ring_pages[pos / AIO_EVENTS_PER_PAGE]); \
-       __event += pos % AIO_EVENTS_PER_PAGE;                           \
-       __event;                                                        \
-})
-
-#define put_aio_ring_event(event) do {         \
-       struct io_event *__event = (event);     \
-       (void)__event;                          \
-       kunmap_atomic((void *)((unsigned long)__event & PAGE_MASK)); \
-} while(0)
-
-static void ctx_rcu_free(struct rcu_head *head)
+void kiocb_set_cancel_fn(struct kiocb *req, kiocb_cancel_fn *cancel)
+{
+       struct kioctx *ctx = req->ki_ctx;
+       unsigned long flags;
+
+       spin_lock_irqsave(&ctx->ctx_lock, flags);
+
+       if (!req->ki_list.next)
+               list_add(&req->ki_list, &ctx->active_reqs);
+
+       req->ki_cancel = cancel;
+
+       spin_unlock_irqrestore(&ctx->ctx_lock, flags);
+}
+EXPORT_SYMBOL(kiocb_set_cancel_fn);
+
+static int kiocb_cancel(struct kioctx *ctx, struct kiocb *kiocb,
+                       struct io_event *res)
+{
+       kiocb_cancel_fn *old, *cancel;
+       int ret = -EINVAL;
+
+       /*
+        * Don't want to set kiocb->ki_cancel = KIOCB_CANCELLED unless it
+        * actually has a cancel function, hence the cmpxchg()
+        */
+
+       cancel = ACCESS_ONCE(kiocb->ki_cancel);
+       do {
+               if (!cancel || cancel == KIOCB_CANCELLED)
+                       return ret;
+
+               old = cancel;
+               cancel = cmpxchg(&kiocb->ki_cancel, old, KIOCB_CANCELLED);
+       } while (cancel != old);
+
+       atomic_inc(&kiocb->ki_users);
+       spin_unlock_irq(&ctx->ctx_lock);
+
+       memset(res, 0, sizeof(*res));
+       res->obj = (u64)(unsigned long)kiocb->ki_obj.user;
+       res->data = kiocb->ki_user_data;
+       ret = cancel(kiocb, res);
+
+       spin_lock_irq(&ctx->ctx_lock);
+
+       return ret;
+}
+
+static void free_ioctx_rcu(struct rcu_head *head)
 {
        struct kioctx *ctx = container_of(head, struct kioctx, rcu_head);
        kmem_cache_free(kioctx_cachep, ctx);
 }
 
-/* __put_ioctx
- *     Called when the last user of an aio context has gone away,
- *     and the struct needs to be freed.
+/*
+ * When this function runs, the kioctx has been removed from the "hash table"
+ * and ctx->users has dropped to 0, so we know no more kiocbs can be submitted -
+ * now it's safe to cancel any that need to be.
  */
-static void __put_ioctx(struct kioctx *ctx)
+static void free_ioctx(struct kioctx *ctx)
 {
-       unsigned nr_events = ctx->max_reqs;
-       BUG_ON(ctx->reqs_active);
+       struct aio_ring *ring;
+       struct io_event res;
+       struct kiocb *req;
+       unsigned head, avail;
 
-       cancel_delayed_work_sync(&ctx->wq);
-       aio_free_ring(ctx);
-       mmdrop(ctx->mm);
-       ctx->mm = NULL;
-       if (nr_events) {
-               spin_lock(&aio_nr_lock);
-               BUG_ON(aio_nr - nr_events > aio_nr);
-               aio_nr -= nr_events;
-               spin_unlock(&aio_nr_lock);
+       spin_lock_irq(&ctx->ctx_lock);
+
+       while (!list_empty(&ctx->active_reqs)) {
+               req = list_first_entry(&ctx->active_reqs,
+                                      struct kiocb, ki_list);
+
+               list_del_init(&req->ki_list);
+               kiocb_cancel(ctx, req, &res);
        }
-       pr_debug("__put_ioctx: freeing %p\n", ctx);
-       call_rcu(&ctx->rcu_head, ctx_rcu_free);
-}
 
-static inline int try_get_ioctx(struct kioctx *kioctx)
-{
-       return atomic_inc_not_zero(&kioctx->users);
+       spin_unlock_irq(&ctx->ctx_lock);
+
+       ring = kmap_atomic(ctx->ring_pages[0]);
+       head = ring->head;
+       kunmap_atomic(ring);
+
+       while (atomic_read(&ctx->reqs_active) > 0) {
+               wait_event(ctx->wait, head != ctx->tail);
+
+               avail = (head <= ctx->tail ? ctx->tail : ctx->nr_events) - head;
+
+               atomic_sub(avail, &ctx->reqs_active);
+               head += avail;
+               head %= ctx->nr_events;
+       }
+
+       WARN_ON(atomic_read(&ctx->reqs_active) < 0);
+
+       aio_free_ring(ctx);
+
+       spin_lock(&aio_nr_lock);
+       BUG_ON(aio_nr - ctx->max_reqs > aio_nr);
+       aio_nr -= ctx->max_reqs;
+       spin_unlock(&aio_nr_lock);
+
+       pr_debug("freeing %p\n", ctx);
+
+       /*
+        * Here the call_rcu() is between the wait_event() for reqs_active to
+        * hit 0, and freeing the ioctx.
+        *
+        * aio_complete() decrements reqs_active, but it has to touch the ioctx
+        * afterwards to issue a wakeup, so we use rcu.
+        */
+       call_rcu(&ctx->rcu_head, free_ioctx_rcu);
 }
 
-static inline void put_ioctx(struct kioctx *kioctx)
+static void put_ioctx(struct kioctx *ctx)
 {
-       BUG_ON(atomic_read(&kioctx->users) <= 0);
-       if (unlikely(atomic_dec_and_test(&kioctx->users)))
-               __put_ioctx(kioctx);
+       if (unlikely(atomic_dec_and_test(&ctx->users)))
+               free_ioctx(ctx);
 }
 
 /* ioctx_alloc
@@ -237,7 +348,7 @@ static inline void put_ioctx(struct kioctx *kioctx)
  */
 static struct kioctx *ioctx_alloc(unsigned nr_events)
 {
-       struct mm_struct *mm;
+       struct mm_struct *mm = current->mm;
        struct kioctx *ctx;
        int err = -ENOMEM;
 
@@ -256,17 +367,15 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
                return ERR_PTR(-ENOMEM);
 
        ctx->max_reqs = nr_events;
-       mm = ctx->mm = current->mm;
-       atomic_inc(&mm->mm_count);
 
        atomic_set(&ctx->users, 2);
+       atomic_set(&ctx->dead, 0);
        spin_lock_init(&ctx->ctx_lock);
-       spin_lock_init(&ctx->ring_info.ring_lock);
+       spin_lock_init(&ctx->completion_lock);
+       mutex_init(&ctx->ring_lock);
        init_waitqueue_head(&ctx->wait);
 
        INIT_LIST_HEAD(&ctx->active_reqs);
-       INIT_LIST_HEAD(&ctx->run_list);
-       INIT_DELAYED_WORK(&ctx->wq, aio_kick_handler);
 
        if (aio_setup_ring(ctx) < 0)
                goto out_freectx;
@@ -286,64 +395,56 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
        hlist_add_head_rcu(&ctx->list, &mm->ioctx_list);
        spin_unlock(&mm->ioctx_lock);
 
-       dprintk("aio: allocated ioctx %p[%ld]: mm=%p mask=0x%x\n",
-               ctx, ctx->user_id, current->mm, ctx->ring_info.nr);
+       pr_debug("allocated ioctx %p[%ld]: mm=%p mask=0x%x\n",
+                ctx, ctx->user_id, mm, ctx->nr_events);
        return ctx;
 
 out_cleanup:
        err = -EAGAIN;
        aio_free_ring(ctx);
 out_freectx:
-       mmdrop(mm);
        kmem_cache_free(kioctx_cachep, ctx);
-       dprintk("aio: error allocating ioctx %d\n", err);
+       pr_debug("error allocating ioctx %d\n", err);
        return ERR_PTR(err);
 }
 
-/* kill_ctx
- *     Cancels all outstanding aio requests on an aio context.  Used 
- *     when the processes owning a context have all exited to encourage 
- *     the rapid destruction of the kioctx.
- */
-static void kill_ctx(struct kioctx *ctx)
+static void kill_ioctx_work(struct work_struct *work)
 {
-       int (*cancel)(struct kiocb *, struct io_event *);
-       struct task_struct *tsk = current;
-       DECLARE_WAITQUEUE(wait, tsk);
-       struct io_event res;
+       struct kioctx *ctx = container_of(work, struct kioctx, rcu_work);
 
-       spin_lock_irq(&ctx->ctx_lock);
-       ctx->dead = 1;
-       while (!list_empty(&ctx->active_reqs)) {
-               struct list_head *pos = ctx->active_reqs.next;
-               struct kiocb *iocb = list_kiocb(pos);
-               list_del_init(&iocb->ki_list);
-               cancel = iocb->ki_cancel;
-               kiocbSetCancelled(iocb);
-               if (cancel) {
-                       iocb->ki_users++;
-                       spin_unlock_irq(&ctx->ctx_lock);
-                       cancel(iocb, &res);
-                       spin_lock_irq(&ctx->ctx_lock);
-               }
-       }
+       wake_up_all(&ctx->wait);
+       put_ioctx(ctx);
+}
 
-       if (!ctx->reqs_active)
-               goto out;
+static void kill_ioctx_rcu(struct rcu_head *head)
+{
+       struct kioctx *ctx = container_of(head, struct kioctx, rcu_head);
 
-       add_wait_queue(&ctx->wait, &wait);
-       set_task_state(tsk, TASK_UNINTERRUPTIBLE);
-       while (ctx->reqs_active) {
-               spin_unlock_irq(&ctx->ctx_lock);
-               io_schedule();
-               set_task_state(tsk, TASK_UNINTERRUPTIBLE);
-               spin_lock_irq(&ctx->ctx_lock);
-       }
-       __set_task_state(tsk, TASK_RUNNING);
-       remove_wait_queue(&ctx->wait, &wait);
+       INIT_WORK(&ctx->rcu_work, kill_ioctx_work);
+       schedule_work(&ctx->rcu_work);
+}
 
-out:
-       spin_unlock_irq(&ctx->ctx_lock);
+/* kill_ioctx
+ *     Cancels all outstanding aio requests on an aio context.  Used
+ *     when the processes owning a context have all exited to encourage
+ *     the rapid destruction of the kioctx.
+ */
+static void kill_ioctx(struct kioctx *ctx)
+{
+       if (!atomic_xchg(&ctx->dead, 1)) {
+               hlist_del_rcu(&ctx->list);
+               /* Between hlist_del_rcu() and dropping the initial ref */
+               synchronize_rcu();
+
+               /*
+                * We can't punt to workqueue here because put_ioctx() ->
+                * free_ioctx() will unmap the ringbuffer, and that has to be
+                * done in the original process's context. kill_ioctx_rcu/work()
+                * exist for exit_aio(), as in that path free_ioctx() won't do
+                * the unmap.
+                */
+               kill_ioctx_work(&ctx->rcu_work);
+       }
 }
 
 /* wait_on_sync_kiocb:
@@ -351,9 +452,9 @@ out:
  */
 ssize_t wait_on_sync_kiocb(struct kiocb *iocb)
 {
-       while (iocb->ki_users) {
+       while (atomic_read(&iocb->ki_users)) {
                set_current_state(TASK_UNINTERRUPTIBLE);
-               if (!iocb->ki_users)
+               if (!atomic_read(&iocb->ki_users))
                        break;
                io_schedule();
        }
@@ -362,28 +463,26 @@ ssize_t wait_on_sync_kiocb(struct kiocb *iocb)
 }
 EXPORT_SYMBOL(wait_on_sync_kiocb);
 
-/* exit_aio: called when the last user of mm goes away.  At this point, 
- * there is no way for any new requests to be submited or any of the 
- * io_* syscalls to be called on the context.  However, there may be 
- * outstanding requests which hold references to the context; as they 
- * go away, they will call put_ioctx and release any pinned memory
- * associated with the request (held via struct page * references).
+/*
+ * exit_aio: called when the last user of mm goes away.  At this point, there is
+ * no way for any new requests to be submitted or any of the io_* syscalls to be
+ * called on the context.
+ *
+ * There may be outstanding kiocbs, but free_ioctx() will explicitly wait on
+ * them.
  */
 void exit_aio(struct mm_struct *mm)
 {
        struct kioctx *ctx;
+       struct hlist_node *n;
 
-       while (!hlist_empty(&mm->ioctx_list)) {
-               ctx = hlist_entry(mm->ioctx_list.first, struct kioctx, list);
-               hlist_del_rcu(&ctx->list);
-
-               kill_ctx(ctx);
-
+       hlist_for_each_entry_safe(ctx, n, &mm->ioctx_list, list) {
                if (1 != atomic_read(&ctx->users))
                        printk(KERN_DEBUG
                                "exit_aio:ioctx still alive: %d %d %d\n",
-                               atomic_read(&ctx->users), ctx->dead,
-                               ctx->reqs_active);
+                               atomic_read(&ctx->users),
+                               atomic_read(&ctx->dead),
+                               atomic_read(&ctx->reqs_active));
                /*
                 * We don't need to bother with munmap() here -
                 * exit_mmap(mm) is coming and it'll unmap everything.
@@ -391,150 +490,53 @@ void exit_aio(struct mm_struct *mm)
                 * as indicator that it needs to unmap the area,
                 * just set it to 0; aio_free_ring() is the only
                 * place that uses ->mmap_size, so it's safe.
-                * That way we get all munmap done to current->mm -
-                * all other callers have ctx->mm == current->mm.
                 */
-               ctx->ring_info.mmap_size = 0;
-               put_ioctx(ctx);
+               ctx->mmap_size = 0;
+
+               if (!atomic_xchg(&ctx->dead, 1)) {
+                       hlist_del_rcu(&ctx->list);
+                       call_rcu(&ctx->rcu_head, kill_ioctx_rcu);
+               }
        }
 }
 
 /* aio_get_req
- *     Allocate a slot for an aio request.  Increments the users count
+ *     Allocate a slot for an aio request.  Increments the ki_users count
  * of the kioctx so that the kioctx stays around until all requests are
  * complete.  Returns NULL if no requests are free.
  *
- * Returns with kiocb->users set to 2.  The io submit code path holds
+ * Returns with kiocb->ki_users set to 2.  The io submit code path holds
  * an extra reference while submitting the i/o.
  * This prevents races between the aio code path referencing the
  * req (after submitting it) and aio_complete() freeing the req.
  */
-static struct kiocb *__aio_get_req(struct kioctx *ctx)
+static inline struct kiocb *aio_get_req(struct kioctx *ctx)
 {
-       struct kiocb *req = NULL;
+       struct kiocb *req;
 
-       req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL);
-       if (unlikely(!req))
+       if (atomic_read(&ctx->reqs_active) >= ctx->nr_events)
                return NULL;
 
-       req->ki_flags = 0;
-       req->ki_users = 2;
-       req->ki_key = 0;
-       req->ki_ctx = ctx;
-       req->ki_cancel = NULL;
-       req->ki_retry = NULL;
-       req->ki_dtor = NULL;
-       req->private = NULL;
-       req->ki_iovec = NULL;
-       INIT_LIST_HEAD(&req->ki_run_list);
-       req->ki_eventfd = NULL;
-
-       return req;
-}
-
-/*
- * struct kiocb's are allocated in batches to reduce the number of
- * times the ctx lock is acquired and released.
- */
-#define KIOCB_BATCH_SIZE       32L
-struct kiocb_batch {
-       struct list_head head;
-       long count; /* number of requests left to allocate */
-};
-
-static void kiocb_batch_init(struct kiocb_batch *batch, long total)
-{
-       INIT_LIST_HEAD(&batch->head);
-       batch->count = total;
-}
-
-static void kiocb_batch_free(struct kioctx *ctx, struct kiocb_batch *batch)
-{
-       struct kiocb *req, *n;
-
-       if (list_empty(&batch->head))
-               return;
-
-       spin_lock_irq(&ctx->ctx_lock);
-       list_for_each_entry_safe(req, n, &batch->head, ki_batch) {
-               list_del(&req->ki_batch);
-               list_del(&req->ki_list);
-               kmem_cache_free(kiocb_cachep, req);
-               ctx->reqs_active--;
-       }
-       if (unlikely(!ctx->reqs_active && ctx->dead))
-               wake_up_all(&ctx->wait);
-       spin_unlock_irq(&ctx->ctx_lock);
-}
-
-/*
- * Allocate a batch of kiocbs.  This avoids taking and dropping the
- * context lock a lot during setup.
- */
-static int kiocb_batch_refill(struct kioctx *ctx, struct kiocb_batch *batch)
-{
-       unsigned short allocated, to_alloc;
-       long avail;
-       struct kiocb *req, *n;
-       struct aio_ring *ring;
-
-       to_alloc = min(batch->count, KIOCB_BATCH_SIZE);
-       for (allocated = 0; allocated < to_alloc; allocated++) {
-               req = __aio_get_req(ctx);
-               if (!req)
-                       /* allocation failed, go with what we've got */
-                       break;
-               list_add(&req->ki_batch, &batch->head);
-       }
-
-       if (allocated == 0)
-               goto out;
-
-       spin_lock_irq(&ctx->ctx_lock);
-       ring = kmap_atomic(ctx->ring_info.ring_pages[0]);
-
-       avail = aio_ring_avail(&ctx->ring_info, ring) - ctx->reqs_active;
-       BUG_ON(avail < 0);
-       if (avail < allocated) {
-               /* Trim back the number of requests. */
-               list_for_each_entry_safe(req, n, &batch->head, ki_batch) {
-                       list_del(&req->ki_batch);
-                       kmem_cache_free(kiocb_cachep, req);
-                       if (--allocated <= avail)
-                               break;
-               }
-       }
-
-       batch->count -= allocated;
-       list_for_each_entry(req, &batch->head, ki_batch) {
-               list_add(&req->ki_list, &ctx->active_reqs);
-               ctx->reqs_active++;
-       }
+       if (atomic_inc_return(&ctx->reqs_active) > ctx->nr_events - 1)
+               goto out_put;
 
-       kunmap_atomic(ring);
-       spin_unlock_irq(&ctx->ctx_lock);
-
-out:
-       return allocated;
-}
+       req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL|__GFP_ZERO);
+       if (unlikely(!req))
+               goto out_put;
 
-static inline struct kiocb *aio_get_req(struct kioctx *ctx,
-                                       struct kiocb_batch *batch)
-{
-       struct kiocb *req;
+       atomic_set(&req->ki_users, 2);
+       req->ki_ctx = ctx;
 
-       if (list_empty(&batch->head))
-               if (kiocb_batch_refill(ctx, batch) == 0)
-                       return NULL;
-       req = list_first_entry(&batch->head, struct kiocb, ki_batch);
-       list_del(&req->ki_batch);
        return req;
+out_put:
+       atomic_dec(&ctx->reqs_active);
+       return NULL;
 }
 
-static inline void really_put_req(struct kioctx *ctx, struct kiocb *req)
+static void kiocb_free(struct kiocb *req)
 {
-       assert_spin_locked(&ctx->ctx_lock);
-
+       if (req->ki_filp)
+               fput(req->ki_filp);
        if (req->ki_eventfd != NULL)
                eventfd_ctx_put(req->ki_eventfd);
        if (req->ki_dtor)
@@ -542,48 +544,12 @@ static inline void really_put_req(struct kioctx *ctx, struct kiocb *req)
        if (req->ki_iovec != &req->ki_inline_vec)
                kfree(req->ki_iovec);
        kmem_cache_free(kiocb_cachep, req);
-       ctx->reqs_active--;
-
-       if (unlikely(!ctx->reqs_active && ctx->dead))
-               wake_up_all(&ctx->wait);
 }
 
-/* __aio_put_req
- *     Returns true if this put was the last user of the request.
- */
-static int __aio_put_req(struct kioctx *ctx, struct kiocb *req)
+void aio_put_req(struct kiocb *req)
 {
-       dprintk(KERN_DEBUG "aio_put(%p): f_count=%ld\n",
-               req, atomic_long_read(&req->ki_filp->f_count));
-
-       assert_spin_locked(&ctx->ctx_lock);
-
-       req->ki_users--;
-       BUG_ON(req->ki_users < 0);
-       if (likely(req->ki_users))
-               return 0;
-       list_del(&req->ki_list);                /* remove from active_reqs */
-       req->ki_cancel = NULL;
-       req->ki_retry = NULL;
-
-       fput(req->ki_filp);
-       req->ki_filp = NULL;
-       really_put_req(ctx, req);
-       return 1;
-}
-
-/* aio_put_req
- *     Returns true if this put was the last user of the kiocb,
- *     false if the request is still in use.
- */
-int aio_put_req(struct kiocb *req)
-{
-       struct kioctx *ctx = req->ki_ctx;
-       int ret;
-       spin_lock_irq(&ctx->ctx_lock);
-       ret = __aio_put_req(ctx, req);
-       spin_unlock_irq(&ctx->ctx_lock);
-       return ret;
+       if (atomic_dec_and_test(&req->ki_users))
+               kiocb_free(req);
 }
 EXPORT_SYMBOL(aio_put_req);
 
@@ -595,13 +561,8 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id)
        rcu_read_lock();
 
        hlist_for_each_entry_rcu(ctx, &mm->ioctx_list, list) {
-               /*
-                * RCU protects us against accessing freed memory but
-                * we have to be careful not to get a reference when the
-                * reference count already dropped to 0 (ctx->dead test
-                * is unreliable because of races).
-                */
-               if (ctx->user_id == ctx_id && !ctx->dead && try_get_ioctx(ctx)){
+               if (ctx->user_id == ctx_id) {
+                       atomic_inc(&ctx->users);
                        ret = ctx;
                        break;
                }
@@ -611,295 +572,16 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id)
        return ret;
 }
 
-/*
- * Queue up a kiocb to be retried. Assumes that the kiocb
- * has already been marked as kicked, and places it on
- * the retry run list for the corresponding ioctx, if it
- * isn't already queued. Returns 1 if it actually queued
- * the kiocb (to tell the caller to activate the work
- * queue to process it), or 0, if it found that it was
- * already queued.
- */
-static inline int __queue_kicked_iocb(struct kiocb *iocb)
-{
-       struct kioctx *ctx = iocb->ki_ctx;
-
-       assert_spin_locked(&ctx->ctx_lock);
-
-       if (list_empty(&iocb->ki_run_list)) {
-               list_add_tail(&iocb->ki_run_list,
-                       &ctx->run_list);
-               return 1;
-       }
-       return 0;
-}
-
-/* aio_run_iocb
- *     This is the core aio execution routine. It is
- *     invoked both for initial i/o submission and
- *     subsequent retries via the aio_kick_handler.
- *     Expects to be invoked with iocb->ki_ctx->lock
- *     already held. The lock is released and reacquired
- *     as needed during processing.
- *
- * Calls the iocb retry method (already setup for the
- * iocb on initial submission) for operation specific
- * handling, but takes care of most of common retry
- * execution details for a given iocb. The retry method
- * needs to be non-blocking as far as possible, to avoid
- * holding up other iocbs waiting to be serviced by the
- * retry kernel thread.
- *
- * The trickier parts in this code have to do with
- * ensuring that only one retry instance is in progress
- * for a given iocb at any time. Providing that guarantee
- * simplifies the coding of individual aio operations as
- * it avoids various potential races.
- */
-static ssize_t aio_run_iocb(struct kiocb *iocb)
-{
-       struct kioctx   *ctx = iocb->ki_ctx;
-       ssize_t (*retry)(struct kiocb *);
-       ssize_t ret;
-
-       if (!(retry = iocb->ki_retry)) {
-               printk("aio_run_iocb: iocb->ki_retry = NULL\n");
-               return 0;
-       }
-
-       /*
-        * We don't want the next retry iteration for this
-        * operation to start until this one has returned and
-        * updated the iocb state. However, wait_queue functions
-        * can trigger a kick_iocb from interrupt context in the
-        * meantime, indicating that data is available for the next
-        * iteration. We want to remember that and enable the
-        * next retry iteration _after_ we are through with
-        * this one.
-        *
-        * So, in order to be able to register a "kick", but
-        * prevent it from being queued now, we clear the kick
-        * flag, but make the kick code *think* that the iocb is
-        * still on the run list until we are actually done.
-        * When we are done with this iteration, we check if
-        * the iocb was kicked in the meantime and if so, queue
-        * it up afresh.
-        */
-
-       kiocbClearKicked(iocb);
-
-       /*
-        * This is so that aio_complete knows it doesn't need to
-        * pull the iocb off the run list (We can't just call
-        * INIT_LIST_HEAD because we don't want a kick_iocb to
-        * queue this on the run list yet)
-        */
-       iocb->ki_run_list.next = iocb->ki_run_list.prev = NULL;
-       spin_unlock_irq(&ctx->ctx_lock);
-
-       /* Quit retrying if the i/o has been cancelled */
-       if (kiocbIsCancelled(iocb)) {
-               ret = -EINTR;
-               aio_complete(iocb, ret, 0);
-               /* must not access the iocb after this */
-               goto out;
-       }
-
-       /*
-        * Now we are all set to call the retry method in async
-        * context.
-        */
-       ret = retry(iocb);
-
-       if (ret != -EIOCBRETRY && ret != -EIOCBQUEUED) {
-               /*
-                * There's no easy way to restart the syscall since other AIO's
-                * may be already running. Just fail this IO with EINTR.
-                */
-               if (unlikely(ret == -ERESTARTSYS || ret == -ERESTARTNOINTR ||
-                            ret == -ERESTARTNOHAND || ret == -ERESTART_RESTARTBLOCK))
-                       ret = -EINTR;
-               aio_complete(iocb, ret, 0);
-       }
-out:
-       spin_lock_irq(&ctx->ctx_lock);
-
-       if (-EIOCBRETRY == ret) {
-               /*
-                * OK, now that we are done with this iteration
-                * and know that there is more left to go,
-                * this is where we let go so that a subsequent
-                * "kick" can start the next iteration
-                */
-
-               /* will make __queue_kicked_iocb succeed from here on */
-               INIT_LIST_HEAD(&iocb->ki_run_list);
-               /* we must queue the next iteration ourselves, if it
-                * has already been kicked */
-               if (kiocbIsKicked(iocb)) {
-                       __queue_kicked_iocb(iocb);
-
-                       /*
-                        * __queue_kicked_iocb will always return 1 here, because
-                        * iocb->ki_run_list is empty at this point so it should
-                        * be safe to unconditionally queue the context into the
-                        * work queue.
-                        */
-                       aio_queue_work(ctx);
-               }
-       }
-       return ret;
-}
-
-/*
- * __aio_run_iocbs:
- *     Process all pending retries queued on the ioctx
- *     run list.
- * Assumes it is operating within the aio issuer's mm
- * context.
- */
-static int __aio_run_iocbs(struct kioctx *ctx)
-{
-       struct kiocb *iocb;
-       struct list_head run_list;
-
-       assert_spin_locked(&ctx->ctx_lock);
-
-       list_replace_init(&ctx->run_list, &run_list);
-       while (!list_empty(&run_list)) {
-               iocb = list_entry(run_list.next, struct kiocb,
-                       ki_run_list);
-               list_del(&iocb->ki_run_list);
-               /*
-                * Hold an extra reference while retrying i/o.
-                */
-               iocb->ki_users++;       /* grab extra reference */
-               aio_run_iocb(iocb);
-               __aio_put_req(ctx, iocb);
-       }
-       if (!list_empty(&ctx->run_list))
-               return 1;
-       return 0;
-}
-
-static void aio_queue_work(struct kioctx * ctx)
-{
-       unsigned long timeout;
-       /*
-        * if someone is waiting, get the work started right
-        * away, otherwise, use a longer delay
-        */
-       smp_mb();
-       if (waitqueue_active(&ctx->wait))
-               timeout = 1;
-       else
-               timeout = HZ/10;
-       queue_delayed_work(aio_wq, &ctx->wq, timeout);
-}
-
-/*
- * aio_run_all_iocbs:
- *     Process all pending retries queued on the ioctx
- *     run list, and keep running them until the list
- *     stays empty.
- * Assumes it is operating within the aio issuer's mm context.
- */
-static inline void aio_run_all_iocbs(struct kioctx *ctx)
-{
-       spin_lock_irq(&ctx->ctx_lock);
-       while (__aio_run_iocbs(ctx))
-               ;
-       spin_unlock_irq(&ctx->ctx_lock);
-}
-
-/*
- * aio_kick_handler:
- *     Work queue handler triggered to process pending
- *     retries on an ioctx. Takes on the aio issuer's
- *     mm context before running the iocbs, so that
- *     copy_xxx_user operates on the issuer's address
- *      space.
- * Run on aiod's context.
- */
-static void aio_kick_handler(struct work_struct *work)
-{
-       struct kioctx *ctx = container_of(work, struct kioctx, wq.work);
-       mm_segment_t oldfs = get_fs();
-       struct mm_struct *mm;
-       int requeue;
-
-       set_fs(USER_DS);
-       use_mm(ctx->mm);
-       spin_lock_irq(&ctx->ctx_lock);
-       requeue =__aio_run_iocbs(ctx);
-       mm = ctx->mm;
-       spin_unlock_irq(&ctx->ctx_lock);
-       unuse_mm(mm);
-       set_fs(oldfs);
-       /*
-        * we're in a worker thread already; no point using non-zero delay
-        */
-       if (requeue)
-               queue_delayed_work(aio_wq, &ctx->wq, 0);
-}
-
-
-/*
- * Called by kick_iocb to queue the kiocb for retry
- * and if required activate the aio work queue to process
- * it
- */
-static void try_queue_kicked_iocb(struct kiocb *iocb)
-{
-       struct kioctx   *ctx = iocb->ki_ctx;
-       unsigned long flags;
-       int run = 0;
-
-       spin_lock_irqsave(&ctx->ctx_lock, flags);
-       /* set this inside the lock so that we can't race with aio_run_iocb()
-        * testing it and putting the iocb on the run list under the lock */
-       if (!kiocbTryKick(iocb))
-               run = __queue_kicked_iocb(iocb);
-       spin_unlock_irqrestore(&ctx->ctx_lock, flags);
-       if (run)
-               aio_queue_work(ctx);
-}
-
-/*
- * kick_iocb:
- *      Called typically from a wait queue callback context
- *      to trigger a retry of the iocb.
- *      The retry is usually executed by aio workqueue
- *      threads (See aio_kick_handler).
- */
-void kick_iocb(struct kiocb *iocb)
-{
-       /* sync iocbs are easy: they can only ever be executing from a 
-        * single context. */
-       if (is_sync_kiocb(iocb)) {
-               kiocbSetKicked(iocb);
-               wake_up_process(iocb->ki_obj.tsk);
-               return;
-       }
-
-       try_queue_kicked_iocb(iocb);
-}
-EXPORT_SYMBOL(kick_iocb);
-
 /* aio_complete
  *     Called when the io request on the given iocb is complete.
- *     Returns true if this is the last user of the request.  The 
- *     only other user of the request can be the cancellation code.
  */
-int aio_complete(struct kiocb *iocb, long res, long res2)
+void aio_complete(struct kiocb *iocb, long res, long res2)
 {
        struct kioctx   *ctx = iocb->ki_ctx;
-       struct aio_ring_info    *info;
        struct aio_ring *ring;
-       struct io_event *event;
+       struct io_event *ev_page, *event;
        unsigned long   flags;
-       unsigned long   tail;
-       int             ret;
+       unsigned tail, pos;
 
        /*
         * Special case handling for sync iocbs:
@@ -909,61 +591,81 @@ int aio_complete(struct kiocb *iocb, long res, long res2)
         *  - the sync task helpfully left a reference to itself in the iocb
         */
        if (is_sync_kiocb(iocb)) {
-               BUG_ON(iocb->ki_users != 1);
+               BUG_ON(atomic_read(&iocb->ki_users) != 1);
                iocb->ki_user_data = res;
-               iocb->ki_users = 0;
+               atomic_set(&iocb->ki_users, 0);
                wake_up_process(iocb->ki_obj.tsk);
-               return 1;
+               return;
        }
 
-       info = &ctx->ring_info;
-
-       /* add a completion event to the ring buffer.
-        * must be done holding ctx->ctx_lock to prevent
-        * other code from messing with the tail
-        * pointer since we might be called from irq
-        * context.
+       /*
+        * Take rcu_read_lock() in case the kioctx is being destroyed, as we
+        * need to issue a wakeup after decrementing reqs_active.
         */
-       spin_lock_irqsave(&ctx->ctx_lock, flags);
+       rcu_read_lock();
 
-       if (iocb->ki_run_list.prev && !list_empty(&iocb->ki_run_list))
-               list_del_init(&iocb->ki_run_list);
+       if (iocb->ki_list.next) {
+               unsigned long flags;
+
+               spin_lock_irqsave(&ctx->ctx_lock, flags);
+               list_del(&iocb->ki_list);
+               spin_unlock_irqrestore(&ctx->ctx_lock, flags);
+       }
 
        /*
         * cancelled requests don't get events, userland was given one
         * when the event got cancelled.
         */
-       if (kiocbIsCancelled(iocb))
+       if (unlikely(xchg(&iocb->ki_cancel,
+                         KIOCB_CANCELLED) == KIOCB_CANCELLED)) {
+               atomic_dec(&ctx->reqs_active);
+               /* Still need the wake_up in case free_ioctx is waiting */
                goto put_rq;
+       }
 
-       ring = kmap_atomic(info->ring_pages[0]);
+       /*
+        * Add a completion event to the ring buffer. Must be done holding
+        * ctx->completion_lock to prevent other code from messing with the tail
+        * pointer since we might be called from irq context.
+        */
+       spin_lock_irqsave(&ctx->completion_lock, flags);
 
-       tail = info->tail;
-       event = aio_ring_event(info, tail);
-       if (++tail >= info->nr)
+       tail = ctx->tail;
+       pos = tail + AIO_EVENTS_OFFSET;
+
+       if (++tail >= ctx->nr_events)
                tail = 0;
 
+       ev_page = kmap_atomic(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]);
+       event = ev_page + pos % AIO_EVENTS_PER_PAGE;
+
        event->obj = (u64)(unsigned long)iocb->ki_obj.user;
        event->data = iocb->ki_user_data;
        event->res = res;
        event->res2 = res2;
 
-       dprintk("aio_complete: %p[%lu]: %p: %p %Lx %lx %lx\n",
-               ctx, tail, iocb, iocb->ki_obj.user, iocb->ki_user_data,
-               res, res2);
+       kunmap_atomic(ev_page);
+       flush_dcache_page(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]);
+
+       pr_debug("%p[%u]: %p: %p %Lx %lx %lx\n",
+                ctx, tail, iocb, iocb->ki_obj.user, iocb->ki_user_data,
+                res, res2);
 
        /* after flagging the request as done, we
         * must never even look at it again
         */
        smp_wmb();      /* make event visible before updating tail */
 
-       info->tail = tail;
-       ring->tail = tail;
+       ctx->tail = tail;
 
-       put_aio_ring_event(event);
+       ring = kmap_atomic(ctx->ring_pages[0]);
+       ring->tail = tail;
        kunmap_atomic(ring);
+       flush_dcache_page(ctx->ring_pages[0]);
+
+       spin_unlock_irqrestore(&ctx->completion_lock, flags);
 
-       pr_debug("added to ring %p at [%lu]\n", iocb, tail);
+       pr_debug("added to ring %p at [%u]\n", iocb, tail);
 
        /*
         * Check if the user asked us to deliver the result through an
@@ -975,7 +677,7 @@ int aio_complete(struct kiocb *iocb, long res, long res2)
 
 put_rq:
        /* everything turned out well, dispose of the aiocb. */
-       ret = __aio_put_req(ctx, iocb);
+       aio_put_req(iocb);
 
        /*
         * We have to order our ring_info tail store above and test
@@ -988,233 +690,133 @@ put_rq:
        if (waitqueue_active(&ctx->wait))
                wake_up(&ctx->wait);
 
-       spin_unlock_irqrestore(&ctx->ctx_lock, flags);
-       return ret;
+       rcu_read_unlock();
 }
 EXPORT_SYMBOL(aio_complete);
 
-/* aio_read_evt
- *     Pull an event off of the ioctx's event ring.  Returns the number of 
- *     events fetched (0 or 1 ;-)
- *     FIXME: make this use cmpxchg.
- *     TODO: make the ringbuffer user mmap()able (requires FIXME).
+/* aio_read_events
+ *     Pull an event off of the ioctx's event ring.  Returns the number of
+ *     events fetched
  */
-static int aio_read_evt(struct kioctx *ioctx, struct io_event *ent)
+static long aio_read_events_ring(struct kioctx *ctx,
+                                struct io_event __user *event, long nr)
 {
-       struct aio_ring_info *info = &ioctx->ring_info;
        struct aio_ring *ring;
-       unsigned long head;
-       int ret = 0;
-
-       ring = kmap_atomic(info->ring_pages[0]);
-       dprintk("in aio_read_evt h%lu t%lu m%lu\n",
-                (unsigned long)ring->head, (unsigned long)ring->tail,
-                (unsigned long)ring->nr);
-
-       if (ring->head == ring->tail)
-               goto out;
+       unsigned head, pos;
+       long ret = 0;
+       int copy_ret;
 
-       spin_lock(&info->ring_lock);
-
-       head = ring->head % info->nr;
-       if (head != ring->tail) {
-               struct io_event *evp = aio_ring_event(info, head);
-               *ent = *evp;
-               head = (head + 1) % info->nr;
-               smp_mb(); /* finish reading the event before updatng the head */
-               ring->head = head;
-               ret = 1;
-               put_aio_ring_event(evp);
-       }
-       spin_unlock(&info->ring_lock);
+       mutex_lock(&ctx->ring_lock);
 
-out:
-       dprintk("leaving aio_read_evt: %d  h%lu t%lu\n", ret,
-                (unsigned long)ring->head, (unsigned long)ring->tail);
+       ring = kmap_atomic(ctx->ring_pages[0]);
+       head = ring->head;
        kunmap_atomic(ring);
-       return ret;
-}
 
-struct aio_timeout {
-       struct timer_list       timer;
-       int                     timed_out;
-       struct task_struct      *p;
-};
+       pr_debug("h%u t%u m%u\n", head, ctx->tail, ctx->nr_events);
 
-static void timeout_func(unsigned long data)
-{
-       struct aio_timeout *to = (struct aio_timeout *)data;
+       if (head == ctx->tail)
+               goto out;
 
-       to->timed_out = 1;
-       wake_up_process(to->p);
-}
+       while (ret < nr) {
+               long avail;
+               struct io_event *ev;
+               struct page *page;
 
-static inline void init_timeout(struct aio_timeout *to)
-{
-       setup_timer_on_stack(&to->timer, timeout_func, (unsigned long) to);
-       to->timed_out = 0;
-       to->p = current;
-}
+               avail = (head <= ctx->tail ? ctx->tail : ctx->nr_events) - head;
+               if (head == ctx->tail)
+                       break;
 
-static inline void set_timeout(long start_jiffies, struct aio_timeout *to,
-                              const struct timespec *ts)
-{
-       to->timer.expires = start_jiffies + timespec_to_jiffies(ts);
-       if (time_after(to->timer.expires, jiffies))
-               add_timer(&to->timer);
-       else
-               to->timed_out = 1;
-}
+               avail = min(avail, nr - ret);
+               avail = min_t(long, avail, AIO_EVENTS_PER_PAGE -
+                           ((head + AIO_EVENTS_OFFSET) % AIO_EVENTS_PER_PAGE));
 
-static inline void clear_timeout(struct aio_timeout *to)
-{
-       del_singleshot_timer_sync(&to->timer);
-}
+               pos = head + AIO_EVENTS_OFFSET;
+               page = ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE];
+               pos %= AIO_EVENTS_PER_PAGE;
 
-static int read_events(struct kioctx *ctx,
-                       long min_nr, long nr,
-                       struct io_event __user *event,
-                       struct timespec __user *timeout)
-{
-       long                    start_jiffies = jiffies;
-       struct task_struct      *tsk = current;
-       DECLARE_WAITQUEUE(wait, tsk);
-       int                     ret;
-       int                     i = 0;
-       struct io_event         ent;
-       struct aio_timeout      to;
-       int                     retry = 0;
-
-       /* needed to zero any padding within an entry (there shouldn't be 
-        * any, but C is fun!
-        */
-       memset(&ent, 0, sizeof(ent));
-retry:
-       ret = 0;
-       while (likely(i < nr)) {
-               ret = aio_read_evt(ctx, &ent);
-               if (unlikely(ret <= 0))
-                       break;
-
-               dprintk("read event: %Lx %Lx %Lx %Lx\n",
-                       ent.data, ent.obj, ent.res, ent.res2);
+               ev = kmap(page);
+               copy_ret = copy_to_user(event + ret, ev + pos,
+                                       sizeof(*ev) * avail);
+               kunmap(page);
 
-               /* Could we split the check in two? */
-               ret = -EFAULT;
-               if (unlikely(copy_to_user(event, &ent, sizeof(ent)))) {
-                       dprintk("aio: lost an event due to EFAULT.\n");
-                       break;
+               if (unlikely(copy_ret)) {
+                       ret = -EFAULT;
+                       goto out;
                }
-               ret = 0;
 
-               /* Good, event copied to userland, update counts. */
-               event ++;
-               i ++;
+               ret += avail;
+               head += avail;
+               head %= ctx->nr_events;
        }
 
-       if (min_nr <= i)
-               return i;
-       if (ret)
-               return ret;
+       ring = kmap_atomic(ctx->ring_pages[0]);
+       ring->head = head;
+       kunmap_atomic(ring);
+       flush_dcache_page(ctx->ring_pages[0]);
 
-       /* End fast path */
+       pr_debug("%li  h%u t%u\n", ret, head, ctx->tail);
 
-       /* racey check, but it gets redone */
-       if (!retry && unlikely(!list_empty(&ctx->run_list))) {
-               retry = 1;
-               aio_run_all_iocbs(ctx);
-               goto retry;
-       }
+       atomic_sub(ret, &ctx->reqs_active);
+out:
+       mutex_unlock(&ctx->ring_lock);
 
-       init_timeout(&to);
-       if (timeout) {
-               struct timespec ts;
-               ret = -EFAULT;
-               if (unlikely(copy_from_user(&ts, timeout, sizeof(ts))))
-                       goto out;
+       return ret;
+}
 
-               set_timeout(start_jiffies, &to, &ts);
-       }
+static bool aio_read_events(struct kioctx *ctx, long min_nr, long nr,
+                           struct io_event __user *event, long *i)
+{
+       long ret = aio_read_events_ring(ctx, event + *i, nr - *i);
 
-       while (likely(i < nr)) {
-               add_wait_queue_exclusive(&ctx->wait, &wait);
-               do {
-                       set_task_state(tsk, TASK_INTERRUPTIBLE);
-                       ret = aio_read_evt(ctx, &ent);
-                       if (ret)
-                               break;
-                       if (min_nr <= i)
-                               break;
-                       if (unlikely(ctx->dead)) {
-                               ret = -EINVAL;
-                               break;
-                       }
-                       if (to.timed_out)       /* Only check after read evt */
-                               break;
-                       /* Try to only show up in io wait if there are ops
-                        *  in flight */
-                       if (ctx->reqs_active)
-                               io_schedule();
-                       else
-                               schedule();
-                       if (signal_pending(tsk)) {
-                               ret = -EINTR;
-                               break;
-                       }
-                       /*ret = aio_read_evt(ctx, &ent);*/
-               } while (1) ;
-
-               set_task_state(tsk, TASK_RUNNING);
-               remove_wait_queue(&ctx->wait, &wait);
-
-               if (unlikely(ret <= 0))
-                       break;
+       if (ret > 0)
+               *i += ret;
 
-               ret = -EFAULT;
-               if (unlikely(copy_to_user(event, &ent, sizeof(ent)))) {
-                       dprintk("aio: lost an event due to EFAULT.\n");
-                       break;
-               }
+       if (unlikely(atomic_read(&ctx->dead)))
+               ret = -EINVAL;
 
-               /* Good, event copied to userland, update counts. */
-               event ++;
-               i ++;
-       }
+       if (!*i)
+               *i = ret;
 
-       if (timeout)
-               clear_timeout(&to);
-out:
-       destroy_timer_on_stack(&to.timer);
-       return i ? i : ret;
+       return ret < 0 || *i >= min_nr;
 }
 
-/* Take an ioctx and remove it from the list of ioctx's.  Protects 
- * against races with itself via ->dead.
- */
-static void io_destroy(struct kioctx *ioctx)
+static long read_events(struct kioctx *ctx, long min_nr, long nr,
+                       struct io_event __user *event,
+                       struct timespec __user *timeout)
 {
-       struct mm_struct *mm = current->mm;
-       int was_dead;
+       ktime_t until = { .tv64 = KTIME_MAX };
+       long ret = 0;
 
-       /* delete the entry from the list is someone else hasn't already */
-       spin_lock(&mm->ioctx_lock);
-       was_dead = ioctx->dead;
-       ioctx->dead = 1;
-       hlist_del_rcu(&ioctx->list);
-       spin_unlock(&mm->ioctx_lock);
+       if (timeout) {
+               struct timespec ts;
 
-       dprintk("aio_release(%p)\n", ioctx);
-       if (likely(!was_dead))
-               put_ioctx(ioctx);       /* twice for the list */
+               if (unlikely(copy_from_user(&ts, timeout, sizeof(ts))))
+                       return -EFAULT;
 
-       kill_ctx(ioctx);
+               until = timespec_to_ktime(ts);
+       }
 
        /*
-        * Wake up any waiters.  The setting of ctx->dead must be seen
-        * by other CPUs at this point.  Right now, we rely on the
-        * locking done by the above calls to ensure this consistency.
+        * Note that aio_read_events() is being called as the conditional - i.e.
+        * we're calling it after prepare_to_wait() has set task state to
+        * TASK_INTERRUPTIBLE.
+        *
+        * But aio_read_events() can block, and if it blocks it's going to flip
+        * the task state back to TASK_RUNNING.
+        *
+        * This should be ok, provided it doesn't flip the state back to
+        * TASK_RUNNING and return 0 too much - that causes us to spin. That
+        * will only happen if the mutex_lock() call blocks, and we then find
+        * the ringbuffer empty. So in practice we should be ok, but it's
+        * something to be aware of when touching this code.
         */
-       wake_up_all(&ioctx->wait);
+       wait_event_interruptible_hrtimeout(ctx->wait,
+                       aio_read_events(ctx, min_nr, nr, event, &ret), until);
+
+       if (!ret && signal_pending(current))
+               ret = -EINTR;
+
+       return ret;
 }
 
 /* sys_io_setup:
@@ -1252,7 +854,7 @@ SYSCALL_DEFINE2(io_setup, unsigned, nr_events, aio_context_t __user *, ctxp)
        if (!IS_ERR(ioctx)) {
                ret = put_user(ioctx->user_id, ctxp);
                if (ret)
-                       io_destroy(ioctx);
+                       kill_ioctx(ioctx);
                put_ioctx(ioctx);
        }
 
@@ -1270,7 +872,7 @@ SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx)
 {
        struct kioctx *ioctx = lookup_ioctx(ctx);
        if (likely(NULL != ioctx)) {
-               io_destroy(ioctx);
+               kill_ioctx(ioctx);
                put_ioctx(ioctx);
                return 0;
        }
@@ -1301,30 +903,21 @@ static void aio_advance_iovec(struct kiocb *iocb, ssize_t ret)
        BUG_ON(ret > 0 && iocb->ki_left == 0);
 }
 
-static ssize_t aio_rw_vect_retry(struct kiocb *iocb)
+typedef ssize_t (aio_rw_op)(struct kiocb *, const struct iovec *,
+                           unsigned long, loff_t);
+
+static ssize_t aio_rw_vect_retry(struct kiocb *iocb, int rw, aio_rw_op *rw_op)
 {
        struct file *file = iocb->ki_filp;
        struct address_space *mapping = file->f_mapping;
        struct inode *inode = mapping->host;
-       ssize_t (*rw_op)(struct kiocb *, const struct iovec *,
-                        unsigned long, loff_t);
        ssize_t ret = 0;
-       unsigned short opcode;
-
-       if ((iocb->ki_opcode == IOCB_CMD_PREADV) ||
-               (iocb->ki_opcode == IOCB_CMD_PREAD)) {
-               rw_op = file->f_op->aio_read;
-               opcode = IOCB_CMD_PREADV;
-       } else {
-               rw_op = file->f_op->aio_write;
-               opcode = IOCB_CMD_PWRITEV;
-       }
 
        /* This matches the pread()/pwrite() logic */
        if (iocb->ki_pos < 0)
                return -EINVAL;
 
-       if (opcode == IOCB_CMD_PWRITEV)
+       if (rw == WRITE)
                file_start_write(file);
        do {
                ret = rw_op(iocb, &iocb->ki_iovec[iocb->ki_cur_seg],
@@ -1336,9 +929,9 @@ static ssize_t aio_rw_vect_retry(struct kiocb *iocb)
        /* retry all partial writes.  retry partial reads as long as its a
         * regular file. */
        } while (ret > 0 && iocb->ki_left > 0 &&
-                (opcode == IOCB_CMD_PWRITEV ||
+                (rw == WRITE ||
                  (!S_ISFIFO(inode->i_mode) && !S_ISSOCK(inode->i_mode))));
-       if (opcode == IOCB_CMD_PWRITEV)
+       if (rw == WRITE)
                file_end_write(file);
 
        /* This means we must have transferred all that we could */
@@ -1348,81 +941,49 @@ static ssize_t aio_rw_vect_retry(struct kiocb *iocb)
 
        /* If we managed to write some out we return that, rather than
         * the eventual error. */
-       if (opcode == IOCB_CMD_PWRITEV
-           && ret < 0 && ret != -EIOCBQUEUED && ret != -EIOCBRETRY
+       if (rw == WRITE
+           && ret < 0 && ret != -EIOCBQUEUED
            && iocb->ki_nbytes - iocb->ki_left)
                ret = iocb->ki_nbytes - iocb->ki_left;
 
        return ret;
 }
 
-static ssize_t aio_fdsync(struct kiocb *iocb)
-{
-       struct file *file = iocb->ki_filp;
-       ssize_t ret = -EINVAL;
-
-       if (file->f_op->aio_fsync)
-               ret = file->f_op->aio_fsync(iocb, 1);
-       return ret;
-}
-
-static ssize_t aio_fsync(struct kiocb *iocb)
-{
-       struct file *file = iocb->ki_filp;
-       ssize_t ret = -EINVAL;
-
-       if (file->f_op->aio_fsync)
-               ret = file->f_op->aio_fsync(iocb, 0);
-       return ret;
-}
-
-static ssize_t aio_setup_vectored_rw(int type, struct kiocb *kiocb, bool compat)
+static ssize_t aio_setup_vectored_rw(int rw, struct kiocb *kiocb, bool compat)
 {
        ssize_t ret;
 
+       kiocb->ki_nr_segs = kiocb->ki_nbytes;
+
 #ifdef CONFIG_COMPAT
        if (compat)
-               ret = compat_rw_copy_check_uvector(type,
+               ret = compat_rw_copy_check_uvector(rw,
                                (struct compat_iovec __user *)kiocb->ki_buf,
-                               kiocb->ki_nbytes, 1, &kiocb->ki_inline_vec,
+                               kiocb->ki_nr_segs, 1, &kiocb->ki_inline_vec,
                                &kiocb->ki_iovec);
        else
 #endif
-               ret = rw_copy_check_uvector(type,
+               ret = rw_copy_check_uvector(rw,
                                (struct iovec __user *)kiocb->ki_buf,
-                               kiocb->ki_nbytes, 1, &kiocb->ki_inline_vec,
+                               kiocb->ki_nr_segs, 1, &kiocb->ki_inline_vec,
                                &kiocb->ki_iovec);
        if (ret < 0)
-               goto out;
-
-       ret = rw_verify_area(type, kiocb->ki_filp, &kiocb->ki_pos, ret);
-       if (ret < 0)
-               goto out;
+               return ret;
 
-       kiocb->ki_nr_segs = kiocb->ki_nbytes;
-       kiocb->ki_cur_seg = 0;
-       /* ki_nbytes/left now reflect bytes instead of segs */
+       /* ki_nbytes now reflect bytes instead of segs */
        kiocb->ki_nbytes = ret;
-       kiocb->ki_left = ret;
-
-       ret = 0;
-out:
-       return ret;
+       return 0;
 }
 
-static ssize_t aio_setup_single_vector(int type, struct file * file, struct kiocb *kiocb)
+static ssize_t aio_setup_single_vector(int rw, struct kiocb *kiocb)
 {
-       int bytes;
-
-       bytes = rw_verify_area(type, file, &kiocb->ki_pos, kiocb->ki_left);
-       if (bytes < 0)
-               return bytes;
+       if (unlikely(!access_ok(!rw, kiocb->ki_buf, kiocb->ki_nbytes)))
+               return -EFAULT;
 
        kiocb->ki_iovec = &kiocb->ki_inline_vec;
        kiocb->ki_iovec->iov_base = kiocb->ki_buf;
-       kiocb->ki_iovec->iov_len = bytes;
+       kiocb->ki_iovec->iov_len = kiocb->ki_nbytes;
        kiocb->ki_nr_segs = 1;
-       kiocb->ki_cur_seg = 0;
        return 0;
 }
 
@@ -1431,96 +992,95 @@ static ssize_t aio_setup_single_vector(int type, struct file * file, struct kioc
  *     Performs the initial checks and aio retry method
  *     setup for the kiocb at the time of io submission.
  */
-static ssize_t aio_setup_iocb(struct kiocb *kiocb, bool compat)
+static ssize_t aio_run_iocb(struct kiocb *req, bool compat)
 {
-       struct file *file = kiocb->ki_filp;
-       ssize_t ret = 0;
+       struct file *file = req->ki_filp;
+       ssize_t ret;
+       int rw;
+       fmode_t mode;
+       aio_rw_op *rw_op;
 
-       switch (kiocb->ki_opcode) {
+       switch (req->ki_opcode) {
        case IOCB_CMD_PREAD:
-               ret = -EBADF;
-               if (unlikely(!(file->f_mode & FMODE_READ)))
-                       break;
-               ret = -EFAULT;
-               if (unlikely(!access_ok(VERIFY_WRITE, kiocb->ki_buf,
-                       kiocb->ki_left)))
-                       break;
-               ret = aio_setup_single_vector(READ, file, kiocb);
-               if (ret)
-                       break;
-               ret = -EINVAL;
-               if (file->f_op->aio_read)
-                       kiocb->ki_retry = aio_rw_vect_retry;
-               break;
-       case IOCB_CMD_PWRITE:
-               ret = -EBADF;
-               if (unlikely(!(file->f_mode & FMODE_WRITE)))
-                       break;
-               ret = -EFAULT;
-               if (unlikely(!access_ok(VERIFY_READ, kiocb->ki_buf,
-                       kiocb->ki_left)))
-                       break;
-               ret = aio_setup_single_vector(WRITE, file, kiocb);
-               if (ret)
-                       break;
-               ret = -EINVAL;
-               if (file->f_op->aio_write)
-                       kiocb->ki_retry = aio_rw_vect_retry;
-               break;
        case IOCB_CMD_PREADV:
-               ret = -EBADF;
-               if (unlikely(!(file->f_mode & FMODE_READ)))
-                       break;
-               ret = aio_setup_vectored_rw(READ, kiocb, compat);
-               if (ret)
-                       break;
-               ret = -EINVAL;
-               if (file->f_op->aio_read)
-                       kiocb->ki_retry = aio_rw_vect_retry;
-               break;
+               mode    = FMODE_READ;
+               rw      = READ;
+               rw_op   = file->f_op->aio_read;
+               goto rw_common;
+
+       case IOCB_CMD_PWRITE:
        case IOCB_CMD_PWRITEV:
-               ret = -EBADF;
-               if (unlikely(!(file->f_mode & FMODE_WRITE)))
-                       break;
-               ret = aio_setup_vectored_rw(WRITE, kiocb, compat);
+               mode    = FMODE_WRITE;
+               rw      = WRITE;
+               rw_op   = file->f_op->aio_write;
+               goto rw_common;
+rw_common:
+               if (unlikely(!(file->f_mode & mode)))
+                       return -EBADF;
+
+               if (!rw_op)
+                       return -EINVAL;
+
+               ret = (req->ki_opcode == IOCB_CMD_PREADV ||
+                      req->ki_opcode == IOCB_CMD_PWRITEV)
+                       ? aio_setup_vectored_rw(rw, req, compat)
+                       : aio_setup_single_vector(rw, req);
                if (ret)
-                       break;
-               ret = -EINVAL;
-               if (file->f_op->aio_write)
-                       kiocb->ki_retry = aio_rw_vect_retry;
+                       return ret;
+
+               ret = rw_verify_area(rw, file, &req->ki_pos, req->ki_nbytes);
+               if (ret < 0)
+                       return ret;
+
+               req->ki_nbytes = ret;
+               req->ki_left = ret;
+
+               ret = aio_rw_vect_retry(req, rw, rw_op);
                break;
+
        case IOCB_CMD_FDSYNC:
-               ret = -EINVAL;
-               if (file->f_op->aio_fsync)
-                       kiocb->ki_retry = aio_fdsync;
+               if (!file->f_op->aio_fsync)
+                       return -EINVAL;
+
+               ret = file->f_op->aio_fsync(req, 1);
                break;
+
        case IOCB_CMD_FSYNC:
-               ret = -EINVAL;
-               if (file->f_op->aio_fsync)
-                       kiocb->ki_retry = aio_fsync;
+               if (!file->f_op->aio_fsync)
+                       return -EINVAL;
+
+               ret = file->f_op->aio_fsync(req, 0);
                break;
+
        default:
-               dprintk("EINVAL: io_submit: no operation provided\n");
-               ret = -EINVAL;
+               pr_debug("EINVAL: no operation provided\n");
+               return -EINVAL;
        }
 
-       if (!kiocb->ki_retry)
-               return ret;
+       if (ret != -EIOCBQUEUED) {
+               /*
+                * There's no easy way to restart the syscall since other AIO's
+                * may be already running. Just fail this IO with EINTR.
+                */
+               if (unlikely(ret == -ERESTARTSYS || ret == -ERESTARTNOINTR ||
+                            ret == -ERESTARTNOHAND ||
+                            ret == -ERESTART_RESTARTBLOCK))
+                       ret = -EINTR;
+               aio_complete(req, ret, 0);
+       }
 
        return 0;
 }
 
 static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
-                        struct iocb *iocb, struct kiocb_batch *batch,
-                        bool compat)
+                        struct iocb *iocb, bool compat)
 {
        struct kiocb *req;
-       struct file *file;
        ssize_t ret;
 
        /* enforce forwards compatibility on users */
        if (unlikely(iocb->aio_reserved1 || iocb->aio_reserved2)) {
-               pr_debug("EINVAL: io_submit: reserve field set\n");
+               pr_debug("EINVAL: reserve field set\n");
                return -EINVAL;
        }
 
@@ -1534,16 +1094,16 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
                return -EINVAL;
        }
 
-       file = fget(iocb->aio_fildes);
-       if (unlikely(!file))
-               return -EBADF;
-
-       req = aio_get_req(ctx, batch);  /* returns with 2 references to req */
-       if (unlikely(!req)) {
-               fput(file);
+       req = aio_get_req(ctx);
+       if (unlikely(!req))
                return -EAGAIN;
+
+       req->ki_filp = fget(iocb->aio_fildes);
+       if (unlikely(!req->ki_filp)) {
+               ret = -EBADF;
+               goto out_put_req;
        }
-       req->ki_filp = file;
+
        if (iocb->aio_flags & IOCB_FLAG_RESFD) {
                /*
                 * If the IOCB_FLAG_RESFD flag of aio_flags is set, get an
@@ -1559,9 +1119,9 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
                }
        }
 
-       ret = put_user(req->ki_key, &user_iocb->aio_key);
+       ret = put_user(KIOCB_KEY, &user_iocb->aio_key);
        if (unlikely(ret)) {
-               dprintk("EFAULT: aio_key\n");
+               pr_debug("EFAULT: aio_key\n");
                goto out_put_req;
        }
 
@@ -1573,41 +1133,14 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
        req->ki_left = req->ki_nbytes = iocb->aio_nbytes;
        req->ki_opcode = iocb->aio_lio_opcode;
 
-       ret = aio_setup_iocb(req, compat);
-
+       ret = aio_run_iocb(req, compat);
        if (ret)
                goto out_put_req;
 
-       spin_lock_irq(&ctx->ctx_lock);
-       /*
-        * We could have raced with io_destroy() and are currently holding a
-        * reference to ctx which should be destroyed. We cannot submit IO
-        * since ctx gets freed as soon as io_submit() puts its reference.  The
-        * check here is reliable: io_destroy() sets ctx->dead before waiting
-        * for outstanding IO and the barrier between these two is realized by
-        * unlock of mm->ioctx_lock and lock of ctx->ctx_lock.  Analogously we
-        * increment ctx->reqs_active before checking for ctx->dead and the
-        * barrier is realized by unlock and lock of ctx->ctx_lock. Thus if we
-        * don't see ctx->dead set here, io_destroy() waits for our IO to
-        * finish.
-        */
-       if (ctx->dead) {
-               spin_unlock_irq(&ctx->ctx_lock);
-               ret = -EINVAL;
-               goto out_put_req;
-       }
-       aio_run_iocb(req);
-       if (!list_empty(&ctx->run_list)) {
-               /* drain the run list */
-               while (__aio_run_iocbs(ctx))
-                       ;
-       }
-       spin_unlock_irq(&ctx->ctx_lock);
-
        aio_put_req(req);       /* drop extra ref to req */
        return 0;
-
 out_put_req:
+       atomic_dec(&ctx->reqs_active);
        aio_put_req(req);       /* drop extra ref to req */
        aio_put_req(req);       /* drop i/o ref to req */
        return ret;
@@ -1620,7 +1153,6 @@ long do_io_submit(aio_context_t ctx_id, long nr,
        long ret = 0;
        int i = 0;
        struct blk_plug plug;
-       struct kiocb_batch batch;
 
        if (unlikely(nr < 0))
                return -EINVAL;
@@ -1633,12 +1165,10 @@ long do_io_submit(aio_context_t ctx_id, long nr,
 
        ctx = lookup_ioctx(ctx_id);
        if (unlikely(!ctx)) {
-               pr_debug("EINVAL: io_submit: invalid context id\n");
+               pr_debug("EINVAL: invalid context id\n");
                return -EINVAL;
        }
 
-       kiocb_batch_init(&batch, nr);
-
        blk_start_plug(&plug);
 
        /*
@@ -1659,13 +1189,12 @@ long do_io_submit(aio_context_t ctx_id, long nr,
                        break;
                }
 
-               ret = io_submit_one(ctx, user_iocb, &tmp, &batch, compat);
+               ret = io_submit_one(ctx, user_iocb, &tmp, compat);
                if (ret)
                        break;
        }
        blk_finish_plug(&plug);
 
-       kiocb_batch_free(ctx, &batch);
        put_ioctx(ctx);
        return i ? i : ret;
 }
@@ -1698,10 +1227,13 @@ static struct kiocb *lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb,
 
        assert_spin_locked(&ctx->ctx_lock);
 
+       if (key != KIOCB_KEY)
+               return NULL;
+
        /* TODO: use a hash or array, this sucks. */
        list_for_each(pos, &ctx->active_reqs) {
                struct kiocb *kiocb = list_kiocb(pos);
-               if (kiocb->ki_obj.user == iocb && kiocb->ki_key == key)
+               if (kiocb->ki_obj.user == iocb)
                        return kiocb;
        }
        return NULL;
@@ -1720,7 +1252,7 @@ static struct kiocb *lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb,
 SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,
                struct io_event __user *, result)
 {
-       int (*cancel)(struct kiocb *iocb, struct io_event *res);
+       struct io_event res;
        struct kioctx *ctx;
        struct kiocb *kiocb;
        u32 key;
@@ -1735,32 +1267,22 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,
                return -EINVAL;
 
        spin_lock_irq(&ctx->ctx_lock);
-       ret = -EAGAIN;
+
        kiocb = lookup_kiocb(ctx, iocb, key);
-       if (kiocb && kiocb->ki_cancel) {
-               cancel = kiocb->ki_cancel;
-               kiocb->ki_users ++;
-               kiocbSetCancelled(kiocb);
-       } else
-               cancel = NULL;
+       if (kiocb)
+               ret = kiocb_cancel(ctx, kiocb, &res);
+       else
+               ret = -EINVAL;
+
        spin_unlock_irq(&ctx->ctx_lock);
 
-       if (NULL != cancel) {
-               struct io_event tmp;
-               pr_debug("calling cancel\n");
-               memset(&tmp, 0, sizeof(tmp));
-               tmp.obj = (u64)(unsigned long)kiocb->ki_obj.user;
-               tmp.data = kiocb->ki_user_data;
-               ret = cancel(kiocb, &tmp);
-               if (!ret) {
-                       /* Cancellation succeeded -- copy the result
-                        * into the user's buffer.
-                        */
-                       if (copy_to_user(result, &tmp, sizeof(tmp)))
-                               ret = -EFAULT;
-               }
-       } else
-               ret = -EINVAL;
+       if (!ret) {
+               /* Cancellation succeeded -- copy the result
+                * into the user's buffer.
+                */
+               if (copy_to_user(result, &res, sizeof(res)))
+                       ret = -EFAULT;
+       }
 
        put_ioctx(ctx);
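For context, the fs/aio.c hunks above move the actual read/write/fsync work into aio_run_iocb(), so io_submit_one() now either gets -EIOCBQUEUED back or completes the iocb on the spot. Below is a minimal user-space sketch of the raw io_setup()/io_submit()/io_getevents() syscalls that exercise this path; the file name, queue depth and buffer size are arbitrary placeholders, not taken from the patch.

#include <fcntl.h>
#include <linux/aio_abi.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	aio_context_t ctx = 0;
	struct iocb cb;
	struct iocb *cbs[1] = { &cb };
	struct io_event ev;
	char buf[4096];
	int fd = open("/tmp/aio-test", O_RDONLY);	/* placeholder path */

	if (fd < 0 || syscall(SYS_io_setup, 8, &ctx) < 0)
		return 1;

	memset(&cb, 0, sizeof(cb));
	cb.aio_lio_opcode = IOCB_CMD_PREAD;	/* dispatched by aio_run_iocb() */
	cb.aio_fildes = fd;
	cb.aio_buf = (unsigned long)buf;
	cb.aio_nbytes = sizeof(buf);
	cb.aio_offset = 0;

	if (syscall(SYS_io_submit, ctx, 1, cbs) != 1)
		return 1;

	/* ev.res holds the byte count on success or a negative errno. */
	if (syscall(SYS_io_getevents, ctx, 1, 1, &ev, NULL) == 1)
		printf("res=%lld\n", (long long)ev.res);

	syscall(SYS_io_destroy, ctx);
	close(fd);
	return 0;
}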
 
index a3f28f331b2bba7e6653da30dce92adba5d97140..8fb42916d8a29812e349fdbfa980ef1c34056ce4 100644 (file)
 #include <linux/workqueue.h>
 #include <linux/slab.h>
 
-struct integrity_slab {
-       struct kmem_cache *slab;
-       unsigned short nr_vecs;
-       char name[8];
-};
-
-#define IS(x) { .nr_vecs = x, .name = "bip-"__stringify(x) }
-struct integrity_slab bip_slab[BIOVEC_NR_POOLS] __read_mostly = {
-       IS(1), IS(4), IS(16), IS(64), IS(128), IS(BIO_MAX_PAGES),
-};
-#undef IS
+#define BIP_INLINE_VECS        4
 
+static struct kmem_cache *bip_slab;
 static struct workqueue_struct *kintegrityd_wq;
 
-static inline unsigned int vecs_to_idx(unsigned int nr)
-{
-       switch (nr) {
-       case 1:
-               return 0;
-       case 2 ... 4:
-               return 1;
-       case 5 ... 16:
-               return 2;
-       case 17 ... 64:
-               return 3;
-       case 65 ... 128:
-               return 4;
-       case 129 ... BIO_MAX_PAGES:
-               return 5;
-       default:
-               BUG();
-       }
-}
-
-static inline int use_bip_pool(unsigned int idx)
-{
-       if (idx == BIOVEC_MAX_IDX)
-               return 1;
-
-       return 0;
-}
-
 /**
  * bio_integrity_alloc - Allocate integrity payload and attach it to bio
  * @bio:       bio to attach integrity metadata to
@@ -84,37 +47,41 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
                                                  unsigned int nr_vecs)
 {
        struct bio_integrity_payload *bip;
-       unsigned int idx = vecs_to_idx(nr_vecs);
        struct bio_set *bs = bio->bi_pool;
-
-       if (!bs)
-               bs = fs_bio_set;
-
-       BUG_ON(bio == NULL);
-       bip = NULL;
-
-       /* Lower order allocations come straight from slab */
-       if (!use_bip_pool(idx))
-               bip = kmem_cache_alloc(bip_slab[idx].slab, gfp_mask);
-
-       /* Use mempool if lower order alloc failed or max vecs were requested */
-       if (bip == NULL) {
-               idx = BIOVEC_MAX_IDX;  /* so we free the payload properly later */
+       unsigned long idx = BIO_POOL_NONE;
+       unsigned inline_vecs;
+
+       if (!bs) {
+               bip = kmalloc(sizeof(struct bio_integrity_payload) +
+                             sizeof(struct bio_vec) * nr_vecs, gfp_mask);
+               inline_vecs = nr_vecs;
+       } else {
                bip = mempool_alloc(bs->bio_integrity_pool, gfp_mask);
-
-               if (unlikely(bip == NULL)) {
-                       printk(KERN_ERR "%s: could not alloc bip\n", __func__);
-                       return NULL;
-               }
+               inline_vecs = BIP_INLINE_VECS;
        }
 
+       if (unlikely(!bip))
+               return NULL;
+
        memset(bip, 0, sizeof(*bip));
 
+       if (nr_vecs > inline_vecs) {
+               bip->bip_vec = bvec_alloc(gfp_mask, nr_vecs, &idx,
+                                         bs->bvec_integrity_pool);
+               if (!bip->bip_vec)
+                       goto err;
+       } else {
+               bip->bip_vec = bip->bip_inline_vecs;
+       }
+
        bip->bip_slab = idx;
        bip->bip_bio = bio;
        bio->bi_integrity = bip;
 
        return bip;
+err:
+       mempool_free(bip, bs->bio_integrity_pool);
+       return NULL;
 }
 EXPORT_SYMBOL(bio_integrity_alloc);
 
@@ -130,20 +97,18 @@ void bio_integrity_free(struct bio *bio)
        struct bio_integrity_payload *bip = bio->bi_integrity;
        struct bio_set *bs = bio->bi_pool;
 
-       if (!bs)
-               bs = fs_bio_set;
-
-       BUG_ON(bip == NULL);
-
-       /* A cloned bio doesn't own the integrity metadata */
-       if (!bio_flagged(bio, BIO_CLONED) && !bio_flagged(bio, BIO_FS_INTEGRITY)
-           && bip->bip_buf != NULL)
+       if (bip->bip_owns_buf)
                kfree(bip->bip_buf);
 
-       if (use_bip_pool(bip->bip_slab))
+       if (bs) {
+               if (bip->bip_slab != BIO_POOL_NONE)
+                       bvec_free(bs->bvec_integrity_pool, bip->bip_vec,
+                                 bip->bip_slab);
+
                mempool_free(bip, bs->bio_integrity_pool);
-       else
-               kmem_cache_free(bip_slab[bip->bip_slab].slab, bip);
+       } else {
+               kfree(bip);
+       }
 
        bio->bi_integrity = NULL;
 }
@@ -419,6 +384,7 @@ int bio_integrity_prep(struct bio *bio)
                return -EIO;
        }
 
+       bip->bip_owns_buf = 1;
        bip->bip_buf = buf;
        bip->bip_size = len;
        bip->bip_sector = bio->bi_sector;
@@ -694,11 +660,11 @@ void bio_integrity_split(struct bio *bio, struct bio_pair *bp, int sectors)
        bp->bio1.bi_integrity = &bp->bip1;
        bp->bio2.bi_integrity = &bp->bip2;
 
-       bp->iv1 = bip->bip_vec[0];
-       bp->iv2 = bip->bip_vec[0];
+       bp->iv1 = bip->bip_vec[bip->bip_idx];
+       bp->iv2 = bip->bip_vec[bip->bip_idx];
 
-       bp->bip1.bip_vec[0] = bp->iv1;
-       bp->bip2.bip_vec[0] = bp->iv2;
+       bp->bip1.bip_vec = &bp->iv1;
+       bp->bip2.bip_vec = &bp->iv2;
 
        bp->iv1.bv_len = sectors * bi->tuple_size;
        bp->iv2.bv_offset += sectors * bi->tuple_size;
@@ -746,13 +712,14 @@ EXPORT_SYMBOL(bio_integrity_clone);
 
 int bioset_integrity_create(struct bio_set *bs, int pool_size)
 {
-       unsigned int max_slab = vecs_to_idx(BIO_MAX_PAGES);
-
        if (bs->bio_integrity_pool)
                return 0;
 
-       bs->bio_integrity_pool =
-               mempool_create_slab_pool(pool_size, bip_slab[max_slab].slab);
+       bs->bio_integrity_pool = mempool_create_slab_pool(pool_size, bip_slab);
+
+       bs->bvec_integrity_pool = biovec_create_pool(bs, pool_size);
+       if (!bs->bvec_integrity_pool)
+               return -1;
 
        if (!bs->bio_integrity_pool)
                return -1;
@@ -765,13 +732,14 @@ void bioset_integrity_free(struct bio_set *bs)
 {
        if (bs->bio_integrity_pool)
                mempool_destroy(bs->bio_integrity_pool);
+
+       if (bs->bvec_integrity_pool)
+               mempool_destroy(bs->bvec_integrity_pool);
 }
 EXPORT_SYMBOL(bioset_integrity_free);
 
 void __init bio_integrity_init(void)
 {
-       unsigned int i;
-
        /*
         * kintegrityd won't block much but may burn a lot of CPU cycles.
         * Make it highpri CPU intensive wq with max concurrency of 1.
@@ -781,14 +749,10 @@ void __init bio_integrity_init(void)
        if (!kintegrityd_wq)
                panic("Failed to create kintegrityd\n");
 
-       for (i = 0 ; i < BIOVEC_NR_POOLS ; i++) {
-               unsigned int size;
-
-               size = sizeof(struct bio_integrity_payload)
-                       + bip_slab[i].nr_vecs * sizeof(struct bio_vec);
-
-               bip_slab[i].slab =
-                       kmem_cache_create(bip_slab[i].name, size, 0,
-                                         SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
-       }
+       bip_slab = kmem_cache_create("bio_integrity_payload",
+                                    sizeof(struct bio_integrity_payload) +
+                                    sizeof(struct bio_vec) * BIP_INLINE_VECS,
+                                    0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
+       if (!bip_slab)
+               panic("Failed to create slab\n");
 }
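The conversion above drops the six per-size bip slabs in favour of a single slab that carries BIP_INLINE_VECS inline vectors, falling back to bvec_alloc() only when more are requested. A stand-alone sketch of that allocation shape in plain C follows; the struct and function names here are hypothetical, not the kernel's.

#include <stdlib.h>

#define INLINE_VECS 4			/* mirrors BIP_INLINE_VECS */

struct vec { void *page; unsigned int len, offset; };

struct payload {
	struct vec *vec;		/* points at inline_vecs or a heap array */
	unsigned int nr_vecs;
	struct vec inline_vecs[INLINE_VECS];
};

static struct payload *payload_alloc(unsigned int nr_vecs)
{
	struct payload *p = calloc(1, sizeof(*p));

	if (!p)
		return NULL;

	if (nr_vecs > INLINE_VECS) {
		p->vec = calloc(nr_vecs, sizeof(*p->vec));
		if (!p->vec) {
			free(p);
			return NULL;
		}
	} else {
		p->vec = p->inline_vecs;	/* common case: no second allocation */
	}
	p->nr_vecs = nr_vecs;
	return p;
}

static void payload_free(struct payload *p)
{
	if (p->vec != p->inline_vecs)
		free(p->vec);
	free(p);
}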
index b96fc6ce485595f0179bc909c807ae197258e671..94bbc04dba77053bb47d3d8b793a3a8218f0a0d2 100644 (file)
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -19,6 +19,7 @@
 #include <linux/swap.h>
 #include <linux/bio.h>
 #include <linux/blkdev.h>
+#include <linux/uio.h>
 #include <linux/iocontext.h>
 #include <linux/slab.h>
 #include <linux/init.h>
@@ -160,12 +161,12 @@ unsigned int bvec_nr_vecs(unsigned short idx)
        return bvec_slabs[idx].nr_vecs;
 }
 
-void bvec_free_bs(struct bio_set *bs, struct bio_vec *bv, unsigned int idx)
+void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned int idx)
 {
        BIO_BUG_ON(idx >= BIOVEC_NR_POOLS);
 
        if (idx == BIOVEC_MAX_IDX)
-               mempool_free(bv, bs->bvec_pool);
+               mempool_free(bv, pool);
        else {
                struct biovec_slab *bvs = bvec_slabs + idx;
 
@@ -173,8 +174,8 @@ void bvec_free_bs(struct bio_set *bs, struct bio_vec *bv, unsigned int idx)
        }
 }
 
-struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx,
-                             struct bio_set *bs)
+struct bio_vec *bvec_alloc(gfp_t gfp_mask, int nr, unsigned long *idx,
+                          mempool_t *pool)
 {
        struct bio_vec *bvl;
 
@@ -210,7 +211,7 @@ struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx,
         */
        if (*idx == BIOVEC_MAX_IDX) {
 fallback:
-               bvl = mempool_alloc(bs->bvec_pool, gfp_mask);
+               bvl = mempool_alloc(pool, gfp_mask);
        } else {
                struct biovec_slab *bvs = bvec_slabs + *idx;
                gfp_t __gfp_mask = gfp_mask & ~(__GFP_WAIT | __GFP_IO);
@@ -252,8 +253,8 @@ static void bio_free(struct bio *bio)
        __bio_free(bio);
 
        if (bs) {
-               if (bio_has_allocated_vec(bio))
-                       bvec_free_bs(bs, bio->bi_io_vec, BIO_POOL_IDX(bio));
+               if (bio_flagged(bio, BIO_OWNS_VEC))
+                       bvec_free(bs->bvec_pool, bio->bi_io_vec, BIO_POOL_IDX(bio));
 
                /*
                 * If we have front padding, adjust the bio pointer before freeing
@@ -297,6 +298,54 @@ void bio_reset(struct bio *bio)
 }
 EXPORT_SYMBOL(bio_reset);
 
+static void bio_alloc_rescue(struct work_struct *work)
+{
+       struct bio_set *bs = container_of(work, struct bio_set, rescue_work);
+       struct bio *bio;
+
+       while (1) {
+               spin_lock(&bs->rescue_lock);
+               bio = bio_list_pop(&bs->rescue_list);
+               spin_unlock(&bs->rescue_lock);
+
+               if (!bio)
+                       break;
+
+               generic_make_request(bio);
+       }
+}
+
+static void punt_bios_to_rescuer(struct bio_set *bs)
+{
+       struct bio_list punt, nopunt;
+       struct bio *bio;
+
+       /*
+        * In order to guarantee forward progress we must punt only bios that
+        * were allocated from this bio_set; otherwise, if there was a bio on
+        * there for a stacking driver higher up in the stack, processing it
+        * could require allocating bios from this bio_set, and doing that from
+        * our own rescuer would be bad.
+        *
+        * Since bio lists are singly linked, pop them all instead of trying to
+        * remove from the middle of the list:
+        */
+
+       bio_list_init(&punt);
+       bio_list_init(&nopunt);
+
+       while ((bio = bio_list_pop(current->bio_list)))
+               bio_list_add(bio->bi_pool == bs ? &punt : &nopunt, bio);
+
+       *current->bio_list = nopunt;
+
+       spin_lock(&bs->rescue_lock);
+       bio_list_merge(&bs->rescue_list, &punt);
+       spin_unlock(&bs->rescue_lock);
+
+       queue_work(bs->rescue_workqueue, &bs->rescue_work);
+}
+
 /**
  * bio_alloc_bioset - allocate a bio for I/O
  * @gfp_mask:   the GFP_ mask given to the slab allocator
@@ -314,11 +363,27 @@ EXPORT_SYMBOL(bio_reset);
  *   previously allocated bio for IO before attempting to allocate a new one.
  *   Failure to do so can cause deadlocks under memory pressure.
  *
+ *   Note that when running under generic_make_request() (i.e. any block
+ *   driver), bios are not submitted until after you return - see the code in
+ *   generic_make_request() that converts recursion into iteration, to prevent
+ *   stack overflows.
+ *
+ *   This would normally mean allocating multiple bios under
+ *   generic_make_request() would be susceptible to deadlocks, but we have
+ *   deadlock avoidance code that resubmits any blocked bios from a rescuer
+ *   thread.
+ *
+ *   However, we do not guarantee forward progress for allocations from other
+ *   mempools. Doing multiple allocations from the same mempool under
+ *   generic_make_request() should be avoided - instead, use bio_set's front_pad
+ *   for per bio allocations.
+ *
  *   RETURNS:
  *   Pointer to new bio on success, NULL on failure.
  */
 struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
 {
+       gfp_t saved_gfp = gfp_mask;
        unsigned front_pad;
        unsigned inline_vecs;
        unsigned long idx = BIO_POOL_NONE;
@@ -336,7 +401,37 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
                front_pad = 0;
                inline_vecs = nr_iovecs;
        } else {
+               /*
+                * generic_make_request() converts recursion to iteration; this
+                * means if we're running beneath it, any bios we allocate and
+                * submit will not be submitted (and thus freed) until after we
+                * return.
+                *
+                * This exposes us to a potential deadlock if we allocate
+                * multiple bios from the same bio_set() while running
+                * underneath generic_make_request(). If we were to allocate
+                * multiple bios (say a stacking block driver that was splitting
+                * bios), we would deadlock if we exhausted the mempool's
+                * reserve.
+                *
+                * We solve this, and guarantee forward progress, with a rescuer
+                * workqueue per bio_set. If we go to allocate and there are
+                * bios on current->bio_list, we first try the allocation
+                * without __GFP_WAIT; if that fails, we punt those bios we
+                * would be blocking to the rescuer workqueue before we retry
+                * with the original gfp_flags.
+                */
+
+               if (current->bio_list && !bio_list_empty(current->bio_list))
+                       gfp_mask &= ~__GFP_WAIT;
+
                p = mempool_alloc(bs->bio_pool, gfp_mask);
+               if (!p && gfp_mask != saved_gfp) {
+                       punt_bios_to_rescuer(bs);
+                       gfp_mask = saved_gfp;
+                       p = mempool_alloc(bs->bio_pool, gfp_mask);
+               }
+
                front_pad = bs->front_pad;
                inline_vecs = BIO_INLINE_VECS;
        }
@@ -348,9 +443,17 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
        bio_init(bio);
 
        if (nr_iovecs > inline_vecs) {
-               bvl = bvec_alloc_bs(gfp_mask, nr_iovecs, &idx, bs);
+               bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, bs->bvec_pool);
+               if (!bvl && gfp_mask != saved_gfp) {
+                       punt_bios_to_rescuer(bs);
+                       gfp_mask = saved_gfp;
+                       bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, bs->bvec_pool);
+               }
+
                if (unlikely(!bvl))
                        goto err_free;
+
+               bio->bi_flags |= 1 << BIO_OWNS_VEC;
        } else if (nr_iovecs) {
                bvl = bio->bi_inline_vecs;
        }
@@ -652,6 +755,181 @@ int bio_add_page(struct bio *bio, struct page *page, unsigned int len,
 }
 EXPORT_SYMBOL(bio_add_page);
 
+struct submit_bio_ret {
+       struct completion event;
+       int error;
+};
+
+static void submit_bio_wait_endio(struct bio *bio, int error)
+{
+       struct submit_bio_ret *ret = bio->bi_private;
+
+       ret->error = error;
+       complete(&ret->event);
+}
+
+/**
+ * submit_bio_wait - submit a bio, and wait until it completes
+ * @rw: whether to %READ or %WRITE, or maybe to %READA (read ahead)
+ * @bio: The &struct bio which describes the I/O
+ *
+ * Simple wrapper around submit_bio(). Returns 0 on success, or the error from
+ * bio_endio() on failure.
+ */
+int submit_bio_wait(int rw, struct bio *bio)
+{
+       struct submit_bio_ret ret;
+
+       rw |= REQ_SYNC;
+       init_completion(&ret.event);
+       bio->bi_private = &ret;
+       bio->bi_end_io = submit_bio_wait_endio;
+       submit_bio(rw, bio);
+       wait_for_completion(&ret.event);
+
+       return ret.error;
+}
+EXPORT_SYMBOL(submit_bio_wait);
+
+/**
+ * bio_advance - increment/complete a bio by some number of bytes
+ * @bio:       bio to advance
+ * @bytes:     number of bytes to complete
+ *
+ * This updates bi_sector, bi_size and bi_idx; if the number of bytes to
+ * complete doesn't align with a bvec boundary, then bv_len and bv_offset will
+ * be updated on the last bvec as well.
+ *
+ * @bio will then represent the remaining, uncompleted portion of the io.
+ */
+void bio_advance(struct bio *bio, unsigned bytes)
+{
+       if (bio_integrity(bio))
+               bio_integrity_advance(bio, bytes);
+
+       bio->bi_sector += bytes >> 9;
+       bio->bi_size -= bytes;
+
+       if (bio->bi_rw & BIO_NO_ADVANCE_ITER_MASK)
+               return;
+
+       while (bytes) {
+               if (unlikely(bio->bi_idx >= bio->bi_vcnt)) {
+                       WARN_ONCE(1, "bio idx %d >= vcnt %d\n",
+                                 bio->bi_idx, bio->bi_vcnt);
+                       break;
+               }
+
+               if (bytes >= bio_iovec(bio)->bv_len) {
+                       bytes -= bio_iovec(bio)->bv_len;
+                       bio->bi_idx++;
+               } else {
+                       bio_iovec(bio)->bv_len -= bytes;
+                       bio_iovec(bio)->bv_offset += bytes;
+                       bytes = 0;
+               }
+       }
+}
+EXPORT_SYMBOL(bio_advance);
+
+/**
+ * bio_alloc_pages - allocates a single page for each bvec in a bio
+ * @bio: bio to allocate pages for
+ * @gfp_mask: flags for allocation
+ *
+ * Allocates pages up to @bio->bi_vcnt.
+ *
+ * Returns 0 on success, -ENOMEM on failure. On failure, any allocated pages are
+ * freed.
+ */
+int bio_alloc_pages(struct bio *bio, gfp_t gfp_mask)
+{
+       int i;
+       struct bio_vec *bv;
+
+       bio_for_each_segment_all(bv, bio, i) {
+               bv->bv_page = alloc_page(gfp_mask);
+               if (!bv->bv_page) {
+                       while (--bv >= bio->bi_io_vec)
+                               __free_page(bv->bv_page);
+                       return -ENOMEM;
+               }
+       }
+
+       return 0;
+}
+EXPORT_SYMBOL(bio_alloc_pages);
+
+/**
+ * bio_copy_data - copy contents of data buffers from one chain of bios to
+ * another
+ * @src: source bio list
+ * @dst: destination bio list
+ *
+ * If @src and @dst are single bios, bi_next must be NULL - otherwise, treats
+ * @src and @dst as linked lists of bios.
+ *
+ * Stops when it reaches the end of either @src or @dst - that is, copies
+ * min(src->bi_size, dst->bi_size) bytes (or the equivalent for lists of bios).
+ */
+void bio_copy_data(struct bio *dst, struct bio *src)
+{
+       struct bio_vec *src_bv, *dst_bv;
+       unsigned src_offset, dst_offset, bytes;
+       void *src_p, *dst_p;
+
+       src_bv = bio_iovec(src);
+       dst_bv = bio_iovec(dst);
+
+       src_offset = src_bv->bv_offset;
+       dst_offset = dst_bv->bv_offset;
+
+       while (1) {
+               if (src_offset == src_bv->bv_offset + src_bv->bv_len) {
+                       src_bv++;
+                       if (src_bv == bio_iovec_idx(src, src->bi_vcnt)) {
+                               src = src->bi_next;
+                               if (!src)
+                                       break;
+
+                               src_bv = bio_iovec(src);
+                       }
+
+                       src_offset = src_bv->bv_offset;
+               }
+
+               if (dst_offset == dst_bv->bv_offset + dst_bv->bv_len) {
+                       dst_bv++;
+                       if (dst_bv == bio_iovec_idx(dst, dst->bi_vcnt)) {
+                               dst = dst->bi_next;
+                               if (!dst)
+                                       break;
+
+                               dst_bv = bio_iovec(dst);
+                       }
+
+                       dst_offset = dst_bv->bv_offset;
+               }
+
+               bytes = min(dst_bv->bv_offset + dst_bv->bv_len - dst_offset,
+                           src_bv->bv_offset + src_bv->bv_len - src_offset);
+
+               src_p = kmap_atomic(src_bv->bv_page);
+               dst_p = kmap_atomic(dst_bv->bv_page);
+
+               memcpy(dst_p + dst_bv->bv_offset,
+                      src_p + src_bv->bv_offset,
+                      bytes);
+
+               kunmap_atomic(dst_p);
+               kunmap_atomic(src_p);
+
+               src_offset += bytes;
+               dst_offset += bytes;
+       }
+}
+EXPORT_SYMBOL(bio_copy_data);
+
 struct bio_map_data {
        struct bio_vec *iovecs;
        struct sg_iovec *sgvecs;
@@ -714,7 +992,7 @@ static int __bio_copy_iov(struct bio *bio, struct bio_vec *iovecs,
        int iov_idx = 0;
        unsigned int iov_off = 0;
 
-       __bio_for_each_segment(bvec, bio, i, 0) {
+       bio_for_each_segment_all(bvec, bio, i) {
                char *bv_addr = page_address(bvec->bv_page);
                unsigned int bv_len = iovecs[i].bv_len;
 
@@ -896,7 +1174,7 @@ struct bio *bio_copy_user_iov(struct request_queue *q,
        return bio;
 cleanup:
        if (!map_data)
-               bio_for_each_segment(bvec, bio, i)
+               bio_for_each_segment_all(bvec, bio, i)
                        __free_page(bvec->bv_page);
 
        bio_put(bio);
@@ -1110,7 +1388,7 @@ static void __bio_unmap_user(struct bio *bio)
        /*
         * make sure we dirty pages we wrote to
         */
-       __bio_for_each_segment(bvec, bio, i, 0) {
+       bio_for_each_segment_all(bvec, bio, i) {
                if (bio_data_dir(bio) == READ)
                        set_page_dirty_lock(bvec->bv_page);
 
@@ -1216,7 +1494,7 @@ static void bio_copy_kern_endio(struct bio *bio, int err)
        int i;
        char *p = bmd->sgvecs[0].iov_base;
 
-       __bio_for_each_segment(bvec, bio, i, 0) {
+       bio_for_each_segment_all(bvec, bio, i) {
                char *addr = page_address(bvec->bv_page);
                int len = bmd->iovecs[i].bv_len;
 
@@ -1256,7 +1534,7 @@ struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len,
        if (!reading) {
                void *p = data;
 
-               bio_for_each_segment(bvec, bio, i) {
+               bio_for_each_segment_all(bvec, bio, i) {
                        char *addr = page_address(bvec->bv_page);
 
                        memcpy(addr, p, bvec->bv_len);
@@ -1301,11 +1579,11 @@ EXPORT_SYMBOL(bio_copy_kern);
  */
 void bio_set_pages_dirty(struct bio *bio)
 {
-       struct bio_vec *bvec = bio->bi_io_vec;
+       struct bio_vec *bvec;
        int i;
 
-       for (i = 0; i < bio->bi_vcnt; i++) {
-               struct page *page = bvec[i].bv_page;
+       bio_for_each_segment_all(bvec, bio, i) {
+               struct page *page = bvec->bv_page;
 
                if (page && !PageCompound(page))
                        set_page_dirty_lock(page);
@@ -1314,11 +1592,11 @@ void bio_set_pages_dirty(struct bio *bio)
 
 static void bio_release_pages(struct bio *bio)
 {
-       struct bio_vec *bvec = bio->bi_io_vec;
+       struct bio_vec *bvec;
        int i;
 
-       for (i = 0; i < bio->bi_vcnt; i++) {
-               struct page *page = bvec[i].bv_page;
+       bio_for_each_segment_all(bvec, bio, i) {
+               struct page *page = bvec->bv_page;
 
                if (page)
                        put_page(page);
@@ -1367,16 +1645,16 @@ static void bio_dirty_fn(struct work_struct *work)
 
 void bio_check_pages_dirty(struct bio *bio)
 {
-       struct bio_vec *bvec = bio->bi_io_vec;
+       struct bio_vec *bvec;
        int nr_clean_pages = 0;
        int i;
 
-       for (i = 0; i < bio->bi_vcnt; i++) {
-               struct page *page = bvec[i].bv_page;
+       bio_for_each_segment_all(bvec, bio, i) {
+               struct page *page = bvec->bv_page;
 
                if (PageDirty(page) || PageCompound(page)) {
                        page_cache_release(page);
-                       bvec[i].bv_page = NULL;
+                       bvec->bv_page = NULL;
                } else {
                        nr_clean_pages++;
                }
@@ -1477,8 +1755,7 @@ struct bio_pair *bio_split(struct bio *bi, int first_sectors)
        trace_block_split(bdev_get_queue(bi->bi_bdev), bi,
                                bi->bi_sector + first_sectors);
 
-       BUG_ON(bi->bi_vcnt != 1 && bi->bi_vcnt != 0);
-       BUG_ON(bi->bi_idx != 0);
+       BUG_ON(bio_segments(bi) > 1);
        atomic_set(&bp->cnt, 3);
        bp->error = 0;
        bp->bio1 = *bi;
@@ -1488,8 +1765,8 @@ struct bio_pair *bio_split(struct bio *bi, int first_sectors)
        bp->bio1.bi_size = first_sectors << 9;
 
        if (bi->bi_vcnt != 0) {
-               bp->bv1 = bi->bi_io_vec[0];
-               bp->bv2 = bi->bi_io_vec[0];
+               bp->bv1 = *bio_iovec(bi);
+               bp->bv2 = *bio_iovec(bi);
 
                if (bio_is_rw(bi)) {
                        bp->bv2.bv_offset += first_sectors << 9;
@@ -1541,7 +1818,7 @@ sector_t bio_sector_offset(struct bio *bio, unsigned short index,
        if (index >= bio->bi_idx)
                index = bio->bi_vcnt - 1;
 
-       __bio_for_each_segment(bv, bio, i, 0) {
+       bio_for_each_segment_all(bv, bio, i) {
                if (i == index) {
                        if (offset > bv->bv_offset)
                                sectors += (offset - bv->bv_offset) / sector_sz;
@@ -1559,29 +1836,25 @@ EXPORT_SYMBOL(bio_sector_offset);
  * create memory pools for biovec's in a bio_set.
  * use the global biovec slabs created for general use.
  */
-static int biovec_create_pools(struct bio_set *bs, int pool_entries)
+mempool_t *biovec_create_pool(struct bio_set *bs, int pool_entries)
 {
        struct biovec_slab *bp = bvec_slabs + BIOVEC_MAX_IDX;
 
-       bs->bvec_pool = mempool_create_slab_pool(pool_entries, bp->slab);
-       if (!bs->bvec_pool)
-               return -ENOMEM;
-
-       return 0;
-}
-
-static void biovec_free_pools(struct bio_set *bs)
-{
-       mempool_destroy(bs->bvec_pool);
+       return mempool_create_slab_pool(pool_entries, bp->slab);
 }
 
 void bioset_free(struct bio_set *bs)
 {
+       if (bs->rescue_workqueue)
+               destroy_workqueue(bs->rescue_workqueue);
+
        if (bs->bio_pool)
                mempool_destroy(bs->bio_pool);
 
+       if (bs->bvec_pool)
+               mempool_destroy(bs->bvec_pool);
+
        bioset_integrity_free(bs);
-       biovec_free_pools(bs);
        bio_put_slab(bs);
 
        kfree(bs);
@@ -1612,6 +1885,10 @@ struct bio_set *bioset_create(unsigned int pool_size, unsigned int front_pad)
 
        bs->front_pad = front_pad;
 
+       spin_lock_init(&bs->rescue_lock);
+       bio_list_init(&bs->rescue_list);
+       INIT_WORK(&bs->rescue_work, bio_alloc_rescue);
+
        bs->bio_slab = bio_find_or_create_slab(front_pad + back_pad);
        if (!bs->bio_slab) {
                kfree(bs);
@@ -1622,9 +1899,15 @@ struct bio_set *bioset_create(unsigned int pool_size, unsigned int front_pad)
        if (!bs->bio_pool)
                goto bad;
 
-       if (!biovec_create_pools(bs, pool_size))
-               return bs;
+       bs->bvec_pool = biovec_create_pool(bs, pool_size);
+       if (!bs->bvec_pool)
+               goto bad;
+
+       bs->rescue_workqueue = alloc_workqueue("bioset", WQ_MEM_RECLAIM, 0);
+       if (!bs->rescue_workqueue)
+               goto bad;
 
+       return bs;
 bad:
        bioset_free(bs);
        return NULL;
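Besides the rescuer-workqueue machinery, the fs/bio.c changes above add several exported helpers (submit_bio_wait(), bio_advance(), bio_alloc_pages(), bio_copy_data()). Below is a hedged, 3.10-era usage sketch for submit_bio_wait(); the field names follow the pre-immutable-biovec API shown in the diff, and the caller, bdev and sector values are placeholders rather than anything from the patch.

#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/fs.h>

/* Hypothetical caller: synchronously read one page at @sector. */
static int read_one_page_sync(struct block_device *bdev, sector_t sector,
			      struct page *page)
{
	struct bio *bio = bio_alloc(GFP_NOIO, 1);
	int ret;

	if (!bio)
		return -ENOMEM;

	bio->bi_bdev = bdev;
	bio->bi_sector = sector;		/* 3.10-era field name */
	bio_add_page(bio, page, PAGE_SIZE, 0);

	ret = submit_bio_wait(READ, bio);	/* blocks until bio_endio() */
	bio_put(bio);
	return ret;
}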
index ce08de7467a337231da89a19958a27ff6a2e8b4c..2091db8cdd783a2287ce9165a7223cfe59cf5a2b 100644 (file)
@@ -27,6 +27,7 @@
 #include <linux/namei.h>
 #include <linux/log2.h>
 #include <linux/cleancache.h>
+#include <linux/aio.h>
 #include <asm/uaccess.h>
 #include "internal.h"
 
@@ -1045,7 +1046,7 @@ void bd_set_size(struct block_device *bdev, loff_t size)
 }
 EXPORT_SYMBOL(bd_set_size);
 
-static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part);
+static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part);
 
 /*
  * bd_mutex locking:
@@ -1400,9 +1401,8 @@ static int blkdev_open(struct inode * inode, struct file * filp)
        return blkdev_get(bdev, filp->f_mode, filp);
 }
 
-static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
+static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
 {
-       int ret = 0;
        struct gendisk *disk = bdev->bd_disk;
        struct block_device *victim = NULL;
 
@@ -1422,7 +1422,7 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
        }
        if (bdev->bd_contains == bdev) {
                if (disk->fops->release)
-                       ret = disk->fops->release(disk, mode);
+                       disk->fops->release(disk, mode);
        }
        if (!bdev->bd_openers) {
                struct module *owner = disk->fops->owner;
@@ -1441,10 +1441,9 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
        bdput(bdev);
        if (victim)
                __blkdev_put(victim, mode, 1);
-       return ret;
 }
 
-int blkdev_put(struct block_device *bdev, fmode_t mode)
+void blkdev_put(struct block_device *bdev, fmode_t mode)
 {
        mutex_lock(&bdev->bd_mutex);
 
@@ -1488,15 +1487,15 @@ int blkdev_put(struct block_device *bdev, fmode_t mode)
 
        mutex_unlock(&bdev->bd_mutex);
 
-       return __blkdev_put(bdev, mode, 0);
+       __blkdev_put(bdev, mode, 0);
 }
 EXPORT_SYMBOL(blkdev_put);
 
 static int blkdev_close(struct inode * inode, struct file * filp)
 {
        struct block_device *bdev = I_BDEV(filp->f_mapping->host);
-
-       return blkdev_put(bdev, filp->f_mode);
+       blkdev_put(bdev, filp->f_mode);
+       return 0;
 }
 
 static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
@@ -1557,7 +1556,7 @@ static ssize_t blkdev_aio_read(struct kiocb *iocb, const struct iovec *iov,
                return 0;
 
        size -= pos;
-       if (size < INT_MAX)
+       if (size < iocb->ki_left)
                nr_segs = iov_shorten((struct iovec *)iov, nr_segs, size);
        return generic_file_aio_read(iocb, iov, nr_segs, pos);
 }
index cdee391fc7bfd57c596204a8474142e7afe60335..73f2bfe3ac9302091608beae85b4aecf28622240 100644 (file)
@@ -2560,8 +2560,7 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
                if (old_compressed)
                        contig = bio->bi_sector == sector;
                else
-                       contig = bio->bi_sector + (bio->bi_size >> 9) ==
-                               sector;
+                       contig = bio_end_sector(bio) == sector;
 
                if (prev_bio_flags != bio_flags || !contig ||
                    merge_bio(rw, tree, page, offset, page_size, bio, bio_flags) ||
index bb8b7a0e28a6b80bd5907f02233ec9c831723912..bc4d54c465a04dde6fe40b8ad4afc2275e82b587 100644 (file)
@@ -24,6 +24,7 @@
 #include <linux/string.h>
 #include <linux/backing-dev.h>
 #include <linux/mpage.h>
+#include <linux/aio.h>
 #include <linux/falloc.h>
 #include <linux/swap.h>
 #include <linux/writeback.h>
index 09c58a35b429d6a82dc261014df2875e7c15f920..898da0a01e040b2bb263f34275ec8afa50265d16 100644 (file)
@@ -32,6 +32,7 @@
 #include <linux/writeback.h>
 #include <linux/statfs.h>
 #include <linux/compat.h>
+#include <linux/aio.h>
 #include <linux/bit_spinlock.h>
 #include <linux/xattr.h>
 #include <linux/posix_acl.h>
index 2854c824ab6443d04eac19cf0b540f79c257cfa0..6789772265707bdd02d97f91dc92e8b34a2fd6bb 100644 (file)
@@ -5177,7 +5177,7 @@ static int bio_size_ok(struct block_device *bdev, struct bio *bio,
        }
 
        prev = &bio->bi_io_vec[bio->bi_vcnt - 1];
-       if ((bio->bi_size >> 9) > max_sectors)
+       if (bio_sectors(bio) > max_sectors)
                return 0;
 
        if (!q->merge_bvec_fn)
index bc1fe14aaa3e4583aa351298fc9faa4f42d2d6a9..d2a4d1bb2d57aec3999e494d52c4f765a0ae48e8 100644 (file)
@@ -2977,7 +2977,6 @@ int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags)
        bio->bi_io_vec[0].bv_offset = bh_offset(bh);
 
        bio->bi_vcnt = 1;
-       bio->bi_idx = 0;
        bio->bi_size = bh->b_size;
 
        bio->bi_end_io = end_bio_bh_io_sync;
index d70830c668333decf6969344e3c6cc2e327f2c0a..656e169074305f12d34859ca07ee710c4c1a954d 100644 (file)
@@ -7,6 +7,7 @@
 #include <linux/mount.h>
 #include <linux/namei.h>
 #include <linux/writeback.h>
+#include <linux/aio.h>
 
 #include "super.h"
 #include "mds_client.h"
index 93f7d021b716c01d0b2a9844f209aca7093c3e46..fc3b55dce184a2637fcbf14d1dc313e61714ad5d 100644 (file)
@@ -47,6 +47,7 @@
 #include <linux/fs_struct.h>
 #include <linux/slab.h>
 #include <linux/pagemap.h>
+#include <linux/aio.h>
 
 #include <asm/uaccess.h>
 #include <asm/mmu_context.h>
index cfb816dc6d9f65b73fdba6d0fadbe15041bf96ad..7ab90f5081eebc4ab8b0de88bef8d0b6310ed113 100644 (file)
@@ -37,6 +37,7 @@
 #include <linux/uio.h>
 #include <linux/atomic.h>
 #include <linux/prefetch.h>
+#include <linux/aio.h>
 
 /*
  * How many user pages to map in one call to get_user_pages().  This determines
@@ -441,8 +442,8 @@ static struct bio *dio_await_one(struct dio *dio)
 static int dio_bio_complete(struct dio *dio, struct bio *bio)
 {
        const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
-       struct bio_vec *bvec = bio->bi_io_vec;
-       int page_no;
+       struct bio_vec *bvec;
+       unsigned i;
 
        if (!uptodate)
                dio->io_error = -EIO;
@@ -450,8 +451,8 @@ static int dio_bio_complete(struct dio *dio, struct bio *bio)
        if (dio->is_async && dio->rw == READ) {
                bio_check_pages_dirty(bio);     /* transfers ownership */
        } else {
-               for (page_no = 0; page_no < bio->bi_vcnt; page_no++) {
-                       struct page *page = bvec[page_no].bv_page;
+               bio_for_each_segment_all(bvec, bio, i) {
+                       struct page *page = bvec->bv_page;
 
                        if (dio->rw == READ && !PageCompound(page))
                                set_page_dirty_lock(page);
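The same iterator swap recurs in several filesystems below: bio_for_each_segment_all() visits every bvec the bio owns, whereas bio_for_each_segment() starts at bi_idx and can therefore skip segments of a partially-completed bio. A small kernel-style sketch of the new iterator; the helper name is a hypothetical example, not from the patch.

#include <linux/bio.h>
#include <linux/mm.h>

/* Hypothetical helper: dirty every page attached to @bio. */
static void dirty_all_pages(struct bio *bio)
{
	struct bio_vec *bvec;
	int i;

	bio_for_each_segment_all(bvec, bio, i)	/* walks all bi_vcnt entries */
		set_page_dirty_lock(bvec->bv_page);
}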
index 63b1f54b6a1ff01862f381ae678afa3d46e4c15e..201f0a0d6b0a2a2be8a2983d2fb63ef27dacc0d8 100644 (file)
@@ -31,6 +31,7 @@
 #include <linux/security.h>
 #include <linux/compat.h>
 #include <linux/fs_stack.h>
+#include <linux/aio.h>
 #include "ecryptfs_kernel.h"
 
 /**
index f936cb50dc0d524250dae6d6ac5382db5c1f25b7..b744228886043d522fa15a32ff98cc80d29db3c9 100644 (file)
@@ -401,7 +401,7 @@ static void _clear_bio(struct bio *bio)
        struct bio_vec *bv;
        unsigned i;
 
-       __bio_for_each_segment(bv, bio, i, 0) {
+       bio_for_each_segment_all(bv, bio, i) {
                unsigned this_count = bv->bv_len;
 
                if (likely(PAGE_SIZE == this_count))
index b963f38ac298e82404befa6265c171aca4584aba..7682b970d0f1352cd019c8076ea55cc470ecd523 100644 (file)
@@ -432,7 +432,7 @@ static void _mark_read4write_pages_uptodate(struct ore_io_state *ios, int ret)
                if (!bio)
                        continue;
 
-               __bio_for_each_segment(bv, bio, i, 0) {
+               bio_for_each_segment_all(bv, bio, i) {
                        struct page *page = bv->bv_page;
 
                        SetPageUptodate(page);
index fe60cc1117d834a1fbdc086b9aad01367518903b..0a87bb10998dc00070bc6d05d6f536c21de44e3a 100644 (file)
@@ -31,6 +31,7 @@
 #include <linux/mpage.h>
 #include <linux/fiemap.h>
 #include <linux/namei.h>
+#include <linux/aio.h>
 #include "ext2.h"
 #include "acl.h"
 #include "xip.h"
index d706dbfa62203cd67d15675b88ca32f5ddf5eb45..23c712825640926988883eedb7eefaf69d9c3250 100644 (file)
@@ -27,6 +27,7 @@
 #include <linux/writeback.h>
 #include <linux/mpage.h>
 #include <linux/namei.h>
+#include <linux/aio.h>
 #include "ext3.h"
 #include "xattr.h"
 #include "acl.h"
index 3dc48cc8b6eb318cfd5d8fdfce551c9d3808b728..6356665a74bb006a096023399fe8dca5363f1435 100644 (file)
@@ -362,22 +362,19 @@ fail:
 /*
  * Release the journal device
  */
-static int ext3_blkdev_put(struct block_device *bdev)
+static void ext3_blkdev_put(struct block_device *bdev)
 {
-       return blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
+       blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
 }
 
-static int ext3_blkdev_remove(struct ext3_sb_info *sbi)
+static void ext3_blkdev_remove(struct ext3_sb_info *sbi)
 {
        struct block_device *bdev;
-       int ret = -ENODEV;
-
        bdev = sbi->journal_bdev;
        if (bdev) {
-               ret = ext3_blkdev_put(bdev);
+               ext3_blkdev_put(bdev);
                sbi->journal_bdev = NULL;
        }
-       return ret;
 }
 
 static inline struct inode *orphan_list_entry(struct list_head *l)
index 64848b595b243018d87c8a213066e90c590d48bb..4959e29573b68d0ed2fec94c28015c52cb22afb9 100644 (file)
@@ -23,6 +23,7 @@
 #include <linux/jbd2.h>
 #include <linux/mount.h>
 #include <linux/path.h>
+#include <linux/aio.h>
 #include <linux/quotaops.h>
 #include <linux/pagevec.h>
 #include "ext4.h"
index 98be6f6974637a4ebb7e4a8b1d471f8f587cdbe6..b8d5d351e24f64b1a5ee02bd90be59f5b19ce290 100644 (file)
@@ -20,6 +20,7 @@
  *     (sct@redhat.com), 1993, 1998
  */
 
+#include <linux/aio.h>
 #include "ext4_jbd2.h"
 #include "truncate.h"
 #include "ext4_extents.h"      /* Needed for EXT_MAX_BLOCKS */
index 793d44b84d7f778fe3981892ad2ea6b76fe86a9f..0723774bdfb5cd3ef27600eaeac7062c330414ac 100644 (file)
@@ -37,6 +37,7 @@
 #include <linux/printk.h>
 #include <linux/slab.h>
 #include <linux/ratelimit.h>
+#include <linux/aio.h>
 
 #include "ext4_jbd2.h"
 #include "xattr.h"
index 5929cd0baa2077ebce9a3b0c51d528de9b89ac33..19599bded62a834be3bc52dbf68cfb0414ec564d 100644 (file)
@@ -18,6 +18,7 @@
 #include <linux/pagevec.h>
 #include <linux/mpage.h>
 #include <linux/namei.h>
+#include <linux/aio.h>
 #include <linux/uio.h>
 #include <linux/bio.h>
 #include <linux/workqueue.h>
index 24a146bde742be9683cea9f13be9e30a1324b9c5..94cc84db7c9aae349b44be229e993ec906a0a25a 100644 (file)
@@ -703,22 +703,19 @@ fail:
 /*
  * Release the journal device
  */
-static int ext4_blkdev_put(struct block_device *bdev)
+static void ext4_blkdev_put(struct block_device *bdev)
 {
-       return blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
+       blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
 }
 
-static int ext4_blkdev_remove(struct ext4_sb_info *sbi)
+static void ext4_blkdev_remove(struct ext4_sb_info *sbi)
 {
        struct block_device *bdev;
-       int ret = -ENODEV;
-
        bdev = sbi->journal_bdev;
        if (bdev) {
-               ret = ext4_blkdev_put(bdev);
+               ext4_blkdev_put(bdev);
                sbi->journal_bdev = NULL;
        }
-       return ret;
 }
 
 static inline struct inode *orphan_list_entry(struct list_head *l)
index 2b6fc131e2ce44aceda047f83237a23b254cf590..b1de01da1a409b42ae5bc6aae4989c201c920639 100644 (file)
@@ -20,6 +20,7 @@
 #include "f2fs.h"
 #include "node.h"
 #include "segment.h"
+#include <trace/events/f2fs.h>
 
 static struct kmem_cache *orphan_entry_slab;
 static struct kmem_cache *inode_entry_slab;
@@ -57,13 +58,19 @@ repeat:
                cond_resched();
                goto repeat;
        }
-       if (f2fs_readpage(sbi, page, index, READ_SYNC)) {
+       if (PageUptodate(page))
+               goto out;
+
+       if (f2fs_readpage(sbi, page, index, READ_SYNC))
+               goto repeat;
+
+       lock_page(page);
+       if (page->mapping != mapping) {
                f2fs_put_page(page, 1);
                goto repeat;
        }
+out:
        mark_page_accessed(page);
-
-       /* We do not allow returning an errorneous page */
        return page;
 }
 
@@ -541,54 +548,44 @@ retry:
  */
 static void block_operations(struct f2fs_sb_info *sbi)
 {
-       int t;
        struct writeback_control wbc = {
                .sync_mode = WB_SYNC_ALL,
                .nr_to_write = LONG_MAX,
                .for_reclaim = 0,
        };
+       struct blk_plug plug;
 
-       /* Stop renaming operation */
-       mutex_lock_op(sbi, RENAME);
-       mutex_lock_op(sbi, DENTRY_OPS);
+       blk_start_plug(&plug);
 
-retry_dents:
-       /* write all the dirty dentry pages */
-       sync_dirty_dir_inodes(sbi);
+retry_flush_dents:
+       mutex_lock_all(sbi);
 
-       mutex_lock_op(sbi, DATA_WRITE);
+       /* write all the dirty dentry pages */
        if (get_pages(sbi, F2FS_DIRTY_DENTS)) {
-               mutex_unlock_op(sbi, DATA_WRITE);
-               goto retry_dents;
+               mutex_unlock_all(sbi);
+               sync_dirty_dir_inodes(sbi);
+               goto retry_flush_dents;
        }
 
-       /* block all the operations */
-       for (t = DATA_NEW; t <= NODE_TRUNC; t++)
-               mutex_lock_op(sbi, t);
-
-       mutex_lock(&sbi->write_inode);
-
        /*
         * POR: we should ensure that there is no dirty node pages
         * until finishing nat/sit flush.
         */
-retry:
-       sync_node_pages(sbi, 0, &wbc);
-
-       mutex_lock_op(sbi, NODE_WRITE);
+retry_flush_nodes:
+       mutex_lock(&sbi->node_write);
 
        if (get_pages(sbi, F2FS_DIRTY_NODES)) {
-               mutex_unlock_op(sbi, NODE_WRITE);
-               goto retry;
+               mutex_unlock(&sbi->node_write);
+               sync_node_pages(sbi, 0, &wbc);
+               goto retry_flush_nodes;
        }
-       mutex_unlock(&sbi->write_inode);
+       blk_finish_plug(&plug);
 }
 
 static void unblock_operations(struct f2fs_sb_info *sbi)
 {
-       int t;
-       for (t = NODE_WRITE; t >= RENAME; t--)
-               mutex_unlock_op(sbi, t);
+       mutex_unlock(&sbi->node_write);
+       mutex_unlock_all(sbi);
 }
 
 static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
@@ -727,9 +724,13 @@ void write_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
        struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
        unsigned long long ckpt_ver;
 
+       trace_f2fs_write_checkpoint(sbi->sb, is_umount, "start block_ops");
+
        mutex_lock(&sbi->cp_mutex);
        block_operations(sbi);
 
+       trace_f2fs_write_checkpoint(sbi->sb, is_umount, "finish block_ops");
+
        f2fs_submit_bio(sbi, DATA, true);
        f2fs_submit_bio(sbi, NODE, true);
        f2fs_submit_bio(sbi, META, true);
@@ -746,13 +747,13 @@ void write_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
        flush_nat_entries(sbi);
        flush_sit_entries(sbi);
 
-       reset_victim_segmap(sbi);
-
        /* unlock all the fs_lock[] in do_checkpoint() */
        do_checkpoint(sbi, is_umount);
 
        unblock_operations(sbi);
        mutex_unlock(&sbi->cp_mutex);
+
+       trace_f2fs_write_checkpoint(sbi->sb, is_umount, "finish checkpoint");
 }
 
 void init_orphan_info(struct f2fs_sb_info *sbi)
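block_operations() above now follows a lock/check/back-off/retry shape for both dirty dentry pages and dirty node pages instead of the old per-operation mutex array. A simplified stand-alone sketch of that shape follows; the lock and the two helpers are hypothetical stand-ins for mutex_lock_all(), get_pages(sbi, F2FS_DIRTY_DENTS) and sync_dirty_dir_inodes().

#include <linux/mutex.h>
#include <linux/types.h>

static DEFINE_MUTEX(all_ops_lock);		/* stand-in for mutex_lock_all() */

/* Trivial stubs so the sketch is self-contained; real code queries f2fs state. */
static bool dirty_dents_pending(void) { return false; }
static void flush_dirty_dents(void) { }

static void freeze_fs_operations(void)
{
retry:
	mutex_lock(&all_ops_lock);
	if (dirty_dents_pending()) {
		mutex_unlock(&all_ops_lock);
		flush_dirty_dents();		/* flush with the lock dropped */
		goto retry;
	}
	/* lock stays held; the checkpoint path drops it in unblock_operations() */
}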
index 7bd22a201125636397c0d79d15a2822e83c130f4..91ff93b0b0f403300f951f0d26fa2698ac535c9a 100644 (file)
@@ -12,6 +12,7 @@
 #include <linux/f2fs_fs.h>
 #include <linux/buffer_head.h>
 #include <linux/mpage.h>
+#include <linux/aio.h>
 #include <linux/writeback.h>
 #include <linux/backing-dev.h>
 #include <linux/blkdev.h>
@@ -21,6 +22,7 @@
 #include "f2fs.h"
 #include "node.h"
 #include "segment.h"
+#include <trace/events/f2fs.h>
 
 /*
  * Lock ordering for the change of data block address:
@@ -54,6 +56,8 @@ int reserve_new_block(struct dnode_of_data *dn)
        if (!inc_valid_block_count(sbi, dn->inode, 1))
                return -ENOSPC;
 
+       trace_f2fs_reserve_new_block(dn->inode, dn->nid, dn->ofs_in_node);
+
        __set_data_blkaddr(dn, NEW_ADDR);
        dn->data_blkaddr = NEW_ADDR;
        sync_inode_page(dn);
@@ -133,7 +137,7 @@ void update_extent_cache(block_t blk_addr, struct dnode_of_data *dn)
                goto end_update;
        }
 
-       /* Frone merge */
+       /* Front merge */
        if (fofs == start_fofs - 1 && blk_addr == start_blkaddr - 1) {
                fi->ext.fofs--;
                fi->ext.blk_addr--;
@@ -169,7 +173,7 @@ end_update:
        return;
 }
 
-struct page *find_data_page(struct inode *inode, pgoff_t index)
+struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync)
 {
        struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
        struct address_space *mapping = inode->i_mapping;
@@ -183,7 +187,7 @@ struct page *find_data_page(struct inode *inode, pgoff_t index)
        f2fs_put_page(page, 0);
 
        set_new_dnode(&dn, inode, NULL, NULL, 0);
-       err = get_dnode_of_data(&dn, index, RDONLY_NODE);
+       err = get_dnode_of_data(&dn, index, LOOKUP_NODE);
        if (err)
                return ERR_PTR(err);
        f2fs_put_dnode(&dn);
@@ -199,12 +203,20 @@ struct page *find_data_page(struct inode *inode, pgoff_t index)
        if (!page)
                return ERR_PTR(-ENOMEM);
 
-       err = f2fs_readpage(sbi, page, dn.data_blkaddr, READ_SYNC);
-       if (err) {
-               f2fs_put_page(page, 1);
-               return ERR_PTR(err);
+       if (PageUptodate(page)) {
+               unlock_page(page);
+               return page;
+       }
+
+       err = f2fs_readpage(sbi, page, dn.data_blkaddr,
+                                       sync ? READ_SYNC : READA);
+       if (sync) {
+               wait_on_page_locked(page);
+               if (!PageUptodate(page)) {
+                       f2fs_put_page(page, 0);
+                       return ERR_PTR(-EIO);
+               }
        }
-       unlock_page(page);
        return page;
 }
 
@@ -222,14 +234,14 @@ struct page *get_lock_data_page(struct inode *inode, pgoff_t index)
        int err;
 
        set_new_dnode(&dn, inode, NULL, NULL, 0);
-       err = get_dnode_of_data(&dn, index, RDONLY_NODE);
+       err = get_dnode_of_data(&dn, index, LOOKUP_NODE);
        if (err)
                return ERR_PTR(err);
        f2fs_put_dnode(&dn);
 
        if (dn.data_blkaddr == NULL_ADDR)
                return ERR_PTR(-ENOENT);
-
+repeat:
        page = grab_cache_page(mapping, index);
        if (!page)
                return ERR_PTR(-ENOMEM);
@@ -241,9 +253,17 @@ struct page *get_lock_data_page(struct inode *inode, pgoff_t index)
        BUG_ON(dn.data_blkaddr == NULL_ADDR);
 
        err = f2fs_readpage(sbi, page, dn.data_blkaddr, READ_SYNC);
-       if (err) {
-               f2fs_put_page(page, 1);
+       if (err)
                return ERR_PTR(err);
+
+       lock_page(page);
+       if (!PageUptodate(page)) {
+               f2fs_put_page(page, 1);
+               return ERR_PTR(-EIO);
+       }
+       if (page->mapping != mapping) {
+               f2fs_put_page(page, 1);
+               goto repeat;
        }
        return page;
 }
@@ -251,6 +271,9 @@ struct page *get_lock_data_page(struct inode *inode, pgoff_t index)
 /*
  * Caller ensures that this data page is never allocated.
  * A new zero-filled data page is allocated in the page cache.
+ *
+ * Also, caller should grab and release a mutex by calling mutex_lock_op() and
+ * mutex_unlock_op().
  */
 struct page *get_new_data_page(struct inode *inode, pgoff_t index,
                                                bool new_i_size)
@@ -262,7 +285,7 @@ struct page *get_new_data_page(struct inode *inode, pgoff_t index,
        int err;
 
        set_new_dnode(&dn, inode, NULL, NULL, 0);
-       err = get_dnode_of_data(&dn, index, 0);
+       err = get_dnode_of_data(&dn, index, ALLOC_NODE);
        if (err)
                return ERR_PTR(err);
 
@@ -273,7 +296,7 @@ struct page *get_new_data_page(struct inode *inode, pgoff_t index,
                }
        }
        f2fs_put_dnode(&dn);
-
+repeat:
        page = grab_cache_page(mapping, index);
        if (!page)
                return ERR_PTR(-ENOMEM);
@@ -283,14 +306,21 @@ struct page *get_new_data_page(struct inode *inode, pgoff_t index,
 
        if (dn.data_blkaddr == NEW_ADDR) {
                zero_user_segment(page, 0, PAGE_CACHE_SIZE);
+               SetPageUptodate(page);
        } else {
                err = f2fs_readpage(sbi, page, dn.data_blkaddr, READ_SYNC);
-               if (err) {
-                       f2fs_put_page(page, 1);
+               if (err)
                        return ERR_PTR(err);
+               lock_page(page);
+               if (!PageUptodate(page)) {
+                       f2fs_put_page(page, 1);
+                       return ERR_PTR(-EIO);
+               }
+               if (page->mapping != mapping) {
+                       f2fs_put_page(page, 1);
+                       goto repeat;
                }
        }
-       SetPageUptodate(page);
 
        if (new_i_size &&
                i_size_read(inode) < ((index + 1) << PAGE_CACHE_SHIFT)) {
@@ -325,21 +355,15 @@ static void read_end_io(struct bio *bio, int err)
 
 /*
  * Fill the locked page with data located in the block address.
- * Read operation is synchronous, and caller must unlock the page.
+ * Return unlocked page.
  */
 int f2fs_readpage(struct f2fs_sb_info *sbi, struct page *page,
                                        block_t blk_addr, int type)
 {
        struct block_device *bdev = sbi->sb->s_bdev;
-       bool sync = (type == READ_SYNC);
        struct bio *bio;
 
-       /* This page can be already read by other threads */
-       if (PageUptodate(page)) {
-               if (!sync)
-                       unlock_page(page);
-               return 0;
-       }
+       trace_f2fs_readpage(page, blk_addr, type);
 
        down_read(&sbi->bio_sem);
 
@@ -354,18 +378,12 @@ int f2fs_readpage(struct f2fs_sb_info *sbi, struct page *page,
                kfree(bio->bi_private);
                bio_put(bio);
                up_read(&sbi->bio_sem);
+               f2fs_put_page(page, 1);
                return -EFAULT;
        }
 
        submit_bio(type, bio);
        up_read(&sbi->bio_sem);
-
-       /* wait for read completion if sync */
-       if (sync) {
-               lock_page(page);
-               if (PageError(page))
-                       return -EIO;
-       }
        return 0;
 }
 
@@ -387,14 +405,18 @@ static int get_data_block_ro(struct inode *inode, sector_t iblock,
        /* Get the page offset from the block offset(iblock) */
        pgofs = (pgoff_t)(iblock >> (PAGE_CACHE_SHIFT - blkbits));
 
-       if (check_extent_cache(inode, pgofs, bh_result))
+       if (check_extent_cache(inode, pgofs, bh_result)) {
+               trace_f2fs_get_data_block(inode, iblock, bh_result, 0);
                return 0;
+       }
 
        /* When reading holes, we need its node page */
        set_new_dnode(&dn, inode, NULL, NULL, 0);
-       err = get_dnode_of_data(&dn, pgofs, RDONLY_NODE);
-       if (err)
+       err = get_dnode_of_data(&dn, pgofs, LOOKUP_NODE_RA);
+       if (err) {
+               trace_f2fs_get_data_block(inode, iblock, bh_result, err);
                return (err == -ENOENT) ? 0 : err;
+       }
 
        /* It does not support data allocation */
        BUG_ON(create);
@@ -419,6 +441,7 @@ static int get_data_block_ro(struct inode *inode, sector_t iblock,
                bh_result->b_size = (i << blkbits);
        }
        f2fs_put_dnode(&dn);
+       trace_f2fs_get_data_block(inode, iblock, bh_result, 0);
        return 0;
 }
 
@@ -437,13 +460,12 @@ static int f2fs_read_data_pages(struct file *file,
 int do_write_data_page(struct page *page)
 {
        struct inode *inode = page->mapping->host;
-       struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
        block_t old_blk_addr, new_blk_addr;
        struct dnode_of_data dn;
        int err = 0;
 
        set_new_dnode(&dn, inode, NULL, NULL, 0);
-       err = get_dnode_of_data(&dn, page->index, RDONLY_NODE);
+       err = get_dnode_of_data(&dn, page->index, LOOKUP_NODE);
        if (err)
                return err;
 
@@ -467,8 +489,6 @@ int do_write_data_page(struct page *page)
                write_data_page(inode, page, &dn,
                                old_blk_addr, &new_blk_addr);
                update_extent_cache(new_blk_addr, &dn);
-               F2FS_I(inode)->data_version =
-                       le64_to_cpu(F2FS_CKPT(sbi)->checkpoint_ver);
        }
 out_writepage:
        f2fs_put_dnode(&dn);
@@ -484,10 +504,11 @@ static int f2fs_write_data_page(struct page *page,
        const pgoff_t end_index = ((unsigned long long) i_size)
                                                        >> PAGE_CACHE_SHIFT;
        unsigned offset;
+       bool need_balance_fs = false;
        int err = 0;
 
        if (page->index < end_index)
-               goto out;
+               goto write;
 
        /*
         * If the offset is out-of-range of file size,
@@ -499,50 +520,46 @@ static int f2fs_write_data_page(struct page *page,
                        dec_page_count(sbi, F2FS_DIRTY_DENTS);
                        inode_dec_dirty_dents(inode);
                }
-               goto unlock_out;
+               goto out;
        }
 
        zero_user_segment(page, offset, PAGE_CACHE_SIZE);
-out:
-       if (sbi->por_doing)
-               goto redirty_out;
-
-       if (wbc->for_reclaim && !S_ISDIR(inode->i_mode) && !is_cold_data(page))
+write:
+       if (sbi->por_doing) {
+               err = AOP_WRITEPAGE_ACTIVATE;
                goto redirty_out;
+       }
 
-       mutex_lock_op(sbi, DATA_WRITE);
+       /* Dentry blocks are controlled by checkpoint */
        if (S_ISDIR(inode->i_mode)) {
                dec_page_count(sbi, F2FS_DIRTY_DENTS);
                inode_dec_dirty_dents(inode);
+               err = do_write_data_page(page);
+       } else {
+               int ilock = mutex_lock_op(sbi);
+               err = do_write_data_page(page);
+               mutex_unlock_op(sbi, ilock);
+               need_balance_fs = true;
        }
-       err = do_write_data_page(page);
-       if (err && err != -ENOENT) {
-               wbc->pages_skipped++;
-               set_page_dirty(page);
-       }
-       mutex_unlock_op(sbi, DATA_WRITE);
+       if (err == -ENOENT)
+               goto out;
+       else if (err)
+               goto redirty_out;
 
        if (wbc->for_reclaim)
                f2fs_submit_bio(sbi, DATA, true);
 
-       if (err == -ENOENT)
-               goto unlock_out;
-
        clear_cold_data(page);
+out:
        unlock_page(page);
-
-       if (!wbc->for_reclaim && !S_ISDIR(inode->i_mode))
+       if (need_balance_fs)
                f2fs_balance_fs(sbi);
        return 0;
 
-unlock_out:
-       unlock_page(page);
-       return (err == -ENOENT) ? 0 : err;
-
 redirty_out:
        wbc->pages_skipped++;
        set_page_dirty(page);
-       return AOP_WRITEPAGE_ACTIVATE;
+       return err;
 }
 
 #define MAX_DESIRED_PAGES_WP   4096
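The rewritten f2fs_write_data_page() above splits its behavior by page type and by error; condensed, with the same identifiers as in the hunk (the dirty-dent counter updates are omitted here):

        if (sbi->por_doing) {                   /* checkpoint recovery in progress */
                err = AOP_WRITEPAGE_ACTIVATE;
                goto redirty_out;               /* keep the page dirty for later */
        }
        if (S_ISDIR(inode->i_mode)) {           /* dentry blocks are controlled by checkpoint */
                err = do_write_data_page(page);
        } else {                                /* regular data: one global lock slot */
                int ilock = mutex_lock_op(sbi);
                err = do_write_data_page(page);
                mutex_unlock_op(sbi, ilock);
                need_balance_fs = true;         /* run f2fs_balance_fs() after unlocking */
        }
        if (err == -ENOENT)
                goto out;                       /* nothing to write: unlock and return 0 */
        else if (err)
                goto redirty_out;               /* pages_skipped++, set_page_dirty, return err */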
@@ -561,19 +578,26 @@ static int f2fs_write_data_pages(struct address_space *mapping,
 {
        struct inode *inode = mapping->host;
        struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+       bool locked = false;
        int ret;
        long excess_nrtw = 0, desired_nrtw;
 
+       /* deal with chardevs and other special files */
+       if (!mapping->a_ops->writepage)
+               return 0;
+
        if (wbc->nr_to_write < MAX_DESIRED_PAGES_WP) {
                desired_nrtw = MAX_DESIRED_PAGES_WP;
                excess_nrtw = desired_nrtw - wbc->nr_to_write;
                wbc->nr_to_write = desired_nrtw;
        }
 
-       if (!S_ISDIR(inode->i_mode))
+       if (!S_ISDIR(inode->i_mode)) {
                mutex_lock(&sbi->writepages);
+               locked = true;
+       }
        ret = write_cache_pages(mapping, wbc, __f2fs_writepage, mapping);
-       if (!S_ISDIR(inode->i_mode))
+       if (locked)
                mutex_unlock(&sbi->writepages);
        f2fs_submit_bio(sbi, DATA, (wbc->sync_mode == WB_SYNC_ALL));
 
@@ -593,39 +617,33 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping,
        pgoff_t index = ((unsigned long long) pos) >> PAGE_CACHE_SHIFT;
        struct dnode_of_data dn;
        int err = 0;
+       int ilock;
 
        /* for nobh_write_end */
        *fsdata = NULL;
 
        f2fs_balance_fs(sbi);
-
+repeat:
        page = grab_cache_page_write_begin(mapping, index, flags);
        if (!page)
                return -ENOMEM;
        *pagep = page;
 
-       mutex_lock_op(sbi, DATA_NEW);
+       ilock = mutex_lock_op(sbi);
 
        set_new_dnode(&dn, inode, NULL, NULL, 0);
-       err = get_dnode_of_data(&dn, index, 0);
-       if (err) {
-               mutex_unlock_op(sbi, DATA_NEW);
-               f2fs_put_page(page, 1);
-               return err;
-       }
+       err = get_dnode_of_data(&dn, index, ALLOC_NODE);
+       if (err)
+               goto err;
 
-       if (dn.data_blkaddr == NULL_ADDR) {
+       if (dn.data_blkaddr == NULL_ADDR)
                err = reserve_new_block(&dn);
-               if (err) {
-                       f2fs_put_dnode(&dn);
-                       mutex_unlock_op(sbi, DATA_NEW);
-                       f2fs_put_page(page, 1);
-                       return err;
-               }
-       }
+
        f2fs_put_dnode(&dn);
+       if (err)
+               goto err;
 
-       mutex_unlock_op(sbi, DATA_NEW);
+       mutex_unlock_op(sbi, ilock);
 
        if ((len == PAGE_CACHE_SIZE) || PageUptodate(page))
                return 0;
@@ -636,21 +654,34 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping,
 
                /* Reading beyond i_size is simple: memset to zero */
                zero_user_segments(page, 0, start, end, PAGE_CACHE_SIZE);
-               return 0;
+               goto out;
        }
 
        if (dn.data_blkaddr == NEW_ADDR) {
                zero_user_segment(page, 0, PAGE_CACHE_SIZE);
        } else {
                err = f2fs_readpage(sbi, page, dn.data_blkaddr, READ_SYNC);
-               if (err) {
-                       f2fs_put_page(page, 1);
+               if (err)
                        return err;
+               lock_page(page);
+               if (!PageUptodate(page)) {
+                       f2fs_put_page(page, 1);
+                       return -EIO;
+               }
+               if (page->mapping != mapping) {
+                       f2fs_put_page(page, 1);
+                       goto repeat;
                }
        }
+out:
        SetPageUptodate(page);
        clear_cold_data(page);
        return 0;
+
+err:
+       mutex_unlock_op(sbi, ilock);
+       f2fs_put_page(page, 1);
+       return err;
 }
 
 static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb,
@@ -681,7 +712,7 @@ static void f2fs_invalidate_data_page(struct page *page, unsigned long offset)
 static int f2fs_release_data_page(struct page *page, gfp_t wait)
 {
        ClearPagePrivate(page);
-       return 0;
+       return 1;
 }
 
 static int f2fs_set_data_page_dirty(struct page *page)
index 025b9e2f935d811566d9fa1f2d2b1e3e738da73d..8d9943786c318effc179b281ea5d4d34947033ff 100644 (file)
@@ -13,7 +13,6 @@
 
 #include <linux/fs.h>
 #include <linux/backing-dev.h>
-#include <linux/proc_fs.h>
 #include <linux/f2fs_fs.h>
 #include <linux/blkdev.h>
 #include <linux/debugfs.h>
@@ -106,7 +105,7 @@ static void update_sit_info(struct f2fs_sb_info *sbi)
                }
        }
        mutex_unlock(&sit_i->sentry_lock);
-       dist = sbi->total_sections * hblks_per_sec * hblks_per_sec / 100;
+       dist = TOTAL_SECS(sbi) * hblks_per_sec * hblks_per_sec / 100;
        si->bimodal = bimodal / dist;
        if (si->dirty_count)
                si->avg_vblocks = total_vblocks / ndirty;
@@ -138,14 +137,13 @@ static void update_mem_info(struct f2fs_sb_info *sbi)
        si->base_mem += f2fs_bitmap_size(TOTAL_SEGS(sbi));
        si->base_mem += 2 * SIT_VBLOCK_MAP_SIZE * TOTAL_SEGS(sbi);
        if (sbi->segs_per_sec > 1)
-               si->base_mem += sbi->total_sections *
-                       sizeof(struct sec_entry);
+               si->base_mem += TOTAL_SECS(sbi) * sizeof(struct sec_entry);
        si->base_mem += __bitmap_size(sbi, SIT_BITMAP);
 
        /* build free segmap */
        si->base_mem += sizeof(struct free_segmap_info);
        si->base_mem += f2fs_bitmap_size(TOTAL_SEGS(sbi));
-       si->base_mem += f2fs_bitmap_size(sbi->total_sections);
+       si->base_mem += f2fs_bitmap_size(TOTAL_SECS(sbi));
 
        /* build curseg */
        si->base_mem += sizeof(struct curseg_info) * NR_CURSEG_TYPE;
@@ -154,7 +152,7 @@ static void update_mem_info(struct f2fs_sb_info *sbi)
        /* build dirty segmap */
        si->base_mem += sizeof(struct dirty_seglist_info);
        si->base_mem += NR_DIRTY_TYPE * f2fs_bitmap_size(TOTAL_SEGS(sbi));
-       si->base_mem += 2 * f2fs_bitmap_size(TOTAL_SEGS(sbi));
+       si->base_mem += f2fs_bitmap_size(TOTAL_SECS(sbi));
 
        /* build nm */
        si->base_mem += sizeof(struct f2fs_nm_info);
index 1be948768e2f1b04293ea19935db4eb244d62049..1ac6b93036b7a23980a66d4f6987a82c34ff32df 100644 (file)
@@ -148,7 +148,7 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir,
 
        for (; bidx < end_block; bidx++) {
                /* no need to allocate new dentry pages to all the indices */
-               dentry_page = find_data_page(dir, bidx);
+               dentry_page = find_data_page(dir, bidx, true);
                if (IS_ERR(dentry_page)) {
                        room = true;
                        continue;
@@ -189,6 +189,9 @@ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir,
        unsigned int max_depth;
        unsigned int level;
 
+       if (namelen > F2FS_NAME_LEN)
+               return NULL;
+
        if (npages == 0)
                return NULL;
 
@@ -246,9 +249,6 @@ ino_t f2fs_inode_by_name(struct inode *dir, struct qstr *qstr)
 void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de,
                struct page *page, struct inode *inode)
 {
-       struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
-
-       mutex_lock_op(sbi, DENTRY_OPS);
        lock_page(page);
        wait_on_page_writeback(page);
        de->ino = cpu_to_le32(inode->i_ino);
@@ -262,7 +262,6 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de,
        F2FS_I(inode)->i_pino = dir->i_ino;
 
        f2fs_put_page(page, 1);
-       mutex_unlock_op(sbi, DENTRY_OPS);
 }
 
 void init_dent_inode(const struct qstr *name, struct page *ipage)
@@ -281,6 +280,43 @@ void init_dent_inode(const struct qstr *name, struct page *ipage)
        set_page_dirty(ipage);
 }
 
+static int make_empty_dir(struct inode *inode, struct inode *parent)
+{
+       struct page *dentry_page;
+       struct f2fs_dentry_block *dentry_blk;
+       struct f2fs_dir_entry *de;
+       void *kaddr;
+
+       dentry_page = get_new_data_page(inode, 0, true);
+       if (IS_ERR(dentry_page))
+               return PTR_ERR(dentry_page);
+
+       kaddr = kmap_atomic(dentry_page);
+       dentry_blk = (struct f2fs_dentry_block *)kaddr;
+
+       de = &dentry_blk->dentry[0];
+       de->name_len = cpu_to_le16(1);
+       de->hash_code = 0;
+       de->ino = cpu_to_le32(inode->i_ino);
+       memcpy(dentry_blk->filename[0], ".", 1);
+       set_de_type(de, inode);
+
+       de = &dentry_blk->dentry[1];
+       de->hash_code = 0;
+       de->name_len = cpu_to_le16(2);
+       de->ino = cpu_to_le32(parent->i_ino);
+       memcpy(dentry_blk->filename[1], "..", 2);
+       set_de_type(de, inode);
+
+       test_and_set_bit_le(0, &dentry_blk->dentry_bitmap);
+       test_and_set_bit_le(1, &dentry_blk->dentry_bitmap);
+       kunmap_atomic(kaddr);
+
+       set_page_dirty(dentry_page);
+       f2fs_put_page(dentry_page, 1);
+       return 0;
+}
+
 static int init_inode_metadata(struct inode *inode,
                struct inode *dir, const struct qstr *name)
 {
@@ -291,7 +327,7 @@ static int init_inode_metadata(struct inode *inode,
                        return err;
 
                if (S_ISDIR(inode->i_mode)) {
-                       err = f2fs_make_empty(inode, dir);
+                       err = make_empty_dir(inode, dir);
                        if (err) {
                                remove_inode_page(inode);
                                return err;
@@ -314,7 +350,7 @@ static int init_inode_metadata(struct inode *inode,
        }
        if (is_inode_flag_set(F2FS_I(inode), FI_INC_LINK)) {
                inc_nlink(inode);
-               f2fs_write_inode(inode, NULL);
+               update_inode_page(inode);
        }
        return 0;
 }
@@ -338,7 +374,7 @@ static void update_parent_metadata(struct inode *dir, struct inode *inode,
        }
 
        if (need_dir_update)
-               f2fs_write_inode(dir, NULL);
+               update_inode_page(dir);
        else
                mark_inode_dirty(dir);
 
@@ -370,6 +406,10 @@ next:
        goto next;
 }
 
+/*
+ * Caller should grab and release a mutex by calling mutex_lock_op() and
+ * mutex_unlock_op().
+ */
 int __f2fs_add_link(struct inode *dir, const struct qstr *name, struct inode *inode)
 {
        unsigned int bit_pos;
@@ -379,7 +419,6 @@ int __f2fs_add_link(struct inode *dir, const struct qstr *name, struct inode *in
        f2fs_hash_t dentry_hash;
        struct f2fs_dir_entry *de;
        unsigned int nbucket, nblock;
-       struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
        size_t namelen = name->len;
        struct page *dentry_page = NULL;
        struct f2fs_dentry_block *dentry_blk = NULL;
@@ -409,12 +448,9 @@ start:
        bidx = dir_block_index(level, (le32_to_cpu(dentry_hash) % nbucket));
 
        for (block = bidx; block <= (bidx + nblock - 1); block++) {
-               mutex_lock_op(sbi, DENTRY_OPS);
                dentry_page = get_new_data_page(dir, block, true);
-               if (IS_ERR(dentry_page)) {
-                       mutex_unlock_op(sbi, DENTRY_OPS);
+               if (IS_ERR(dentry_page))
                        return PTR_ERR(dentry_page);
-               }
 
                dentry_blk = kmap(dentry_page);
                bit_pos = room_for_filename(dentry_blk, slots);
@@ -423,7 +459,6 @@ start:
 
                kunmap(dentry_page);
                f2fs_put_page(dentry_page, 1);
-               mutex_unlock_op(sbi, DENTRY_OPS);
        }
 
        /* Move to next level to find the empty slot for new dentry */
@@ -453,7 +488,6 @@ add_dentry:
 fail:
        kunmap(dentry_page);
        f2fs_put_page(dentry_page, 1);
-       mutex_unlock_op(sbi, DENTRY_OPS);
        return err;
 }
 
@@ -473,8 +507,6 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
        void *kaddr = page_address(page);
        int i;
 
-       mutex_lock_op(sbi, DENTRY_OPS);
-
        lock_page(page);
        wait_on_page_writeback(page);
 
@@ -494,7 +526,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
 
        if (inode && S_ISDIR(inode->i_mode)) {
                drop_nlink(dir);
-               f2fs_write_inode(dir, NULL);
+               update_inode_page(dir);
        } else {
                mark_inode_dirty(dir);
        }
@@ -506,7 +538,8 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
                        drop_nlink(inode);
                        i_size_write(inode, 0);
                }
-               f2fs_write_inode(inode, NULL);
+               update_inode_page(inode);
+
                if (inode->i_nlink == 0)
                        add_orphan_inode(sbi, inode->i_ino);
        }
@@ -519,45 +552,6 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
                inode_dec_dirty_dents(dir);
        }
        f2fs_put_page(page, 1);
-
-       mutex_unlock_op(sbi, DENTRY_OPS);
-}
-
-int f2fs_make_empty(struct inode *inode, struct inode *parent)
-{
-       struct page *dentry_page;
-       struct f2fs_dentry_block *dentry_blk;
-       struct f2fs_dir_entry *de;
-       void *kaddr;
-
-       dentry_page = get_new_data_page(inode, 0, true);
-       if (IS_ERR(dentry_page))
-               return PTR_ERR(dentry_page);
-
-       kaddr = kmap_atomic(dentry_page);
-       dentry_blk = (struct f2fs_dentry_block *)kaddr;
-
-       de = &dentry_blk->dentry[0];
-       de->name_len = cpu_to_le16(1);
-       de->hash_code = f2fs_dentry_hash(".", 1);
-       de->ino = cpu_to_le32(inode->i_ino);
-       memcpy(dentry_blk->filename[0], ".", 1);
-       set_de_type(de, inode);
-
-       de = &dentry_blk->dentry[1];
-       de->hash_code = f2fs_dentry_hash("..", 2);
-       de->name_len = cpu_to_le16(2);
-       de->ino = cpu_to_le32(parent->i_ino);
-       memcpy(dentry_blk->filename[1], "..", 2);
-       set_de_type(de, inode);
-
-       test_and_set_bit_le(0, &dentry_blk->dentry_bitmap);
-       test_and_set_bit_le(1, &dentry_blk->dentry_bitmap);
-       kunmap_atomic(kaddr);
-
-       set_page_dirty(dentry_page);
-       f2fs_put_page(dentry_page, 1);
-       return 0;
 }
 
 bool f2fs_empty_dir(struct inode *dir)
index 201c8d3b0f863f829dae24de708c9077576dc763..20aab02f2a427181a4240ad99ad2b8200a235509 100644 (file)
@@ -125,11 +125,15 @@ static inline int update_sits_in_cursum(struct f2fs_summary_block *rs, int i)
                                         * file keeping -1 as its node offset to
                                         * distinguish from index node blocks.
                                         */
-#define RDONLY_NODE            1       /*
-                                        * specify a read-only mode when getting
-                                        * a node block. 0 is read-write mode.
-                                        * used by get_dnode_of_data().
+enum {
+       ALLOC_NODE,                     /* allocate a new node page if needed */
+       LOOKUP_NODE,                    /* look up a node without readahead */
+       LOOKUP_NODE_RA,                 /*
+                                        * look up a node with readahead called
+                                        * by get_data_block_ro().
                                         */
+};
+
 #define F2FS_LINK_MAX          32000   /* maximum link count per file */
 
 /* for in-memory extent cache entry */
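The single RDONLY_NODE flag for get_dnode_of_data() becomes an explicit mode. The call sites changed elsewhere in this patch map onto the new values as follows (illustrative fragments copied from the other hunks):

        /* write paths allocate the dnode when it does not exist yet */
        err = get_dnode_of_data(&dn, index, ALLOC_NODE);        /* f2fs_write_begin(), page_mkwrite(), expand_inode_data() */

        /* plain lookup, no allocation, no readahead */
        err = get_dnode_of_data(&dn, page->index, LOOKUP_NODE); /* do_write_data_page(), truncate paths */

        /* lookup with node readahead */
        err = get_dnode_of_data(&dn, pgofs, LOOKUP_NODE_RA);    /* get_data_block_ro() */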
@@ -144,6 +148,7 @@ struct extent_info {
  * i_advise uses FADVISE_XXX_BIT. We can add additional hints later.
  */
 #define FADVISE_COLD_BIT       0x01
+#define FADVISE_CP_BIT         0x02
 
 struct f2fs_inode_info {
        struct inode vfs_inode;         /* serve a vfs inode */
@@ -155,7 +160,6 @@ struct f2fs_inode_info {
 
        /* Use below internally in f2fs*/
        unsigned long flags;            /* use to pass per-file flags */
-       unsigned long long data_version;/* latest version of data for fsync */
        atomic_t dirty_dents;           /* # of dirty dentry pages */
        f2fs_hash_t chash;              /* hash value of given file name */
        unsigned int clevel;            /* maximum level of given file name */
@@ -186,7 +190,6 @@ static inline void set_raw_extent(struct extent_info *ext,
 struct f2fs_nm_info {
        block_t nat_blkaddr;            /* base disk address of NAT */
        nid_t max_nid;                  /* maximum possible node ids */
-       nid_t init_scan_nid;            /* the first nid to be scanned */
        nid_t next_scan_nid;            /* the next nid to be scanned */
 
        /* NAT cache management */
@@ -305,23 +308,12 @@ enum count_type {
 };
 
 /*
- * FS_LOCK nesting subclasses for the lock validator:
- *
- * The locking order between these classes is
- * RENAME -> DENTRY_OPS -> DATA_WRITE -> DATA_NEW
- *    -> DATA_TRUNC -> NODE_WRITE -> NODE_NEW -> NODE_TRUNC
+ * Used as sbi->fs_lock[NR_GLOBAL_LOCKS].
+ * The checkpoint procedure blocks all of the locks in this fs_lock array.
+ * Other FS operations grab any free lock; if no lock is free, they wait
+ * for one chosen in a round-robin manner.
  */
-enum lock_type {
-       RENAME,         /* for renaming operations */
-       DENTRY_OPS,     /* for directory operations */
-       DATA_WRITE,     /* for data write */
-       DATA_NEW,       /* for data allocation */
-       DATA_TRUNC,     /* for data truncate */
-       NODE_NEW,       /* for node allocation */
-       NODE_TRUNC,     /* for node truncate */
-       NODE_WRITE,     /* for node write */
-       NR_LOCK_TYPE,
-};
+#define NR_GLOBAL_LOCKS        8
 
 /*
  * Below are the page types of bios used in submit_bio().
@@ -361,11 +353,13 @@ struct f2fs_sb_info {
        /* for checkpoint */
        struct f2fs_checkpoint *ckpt;           /* raw checkpoint pointer */
        struct inode *meta_inode;               /* cache meta blocks */
-       struct mutex cp_mutex;                  /* for checkpoint procedure */
-       struct mutex fs_lock[NR_LOCK_TYPE];     /* for blocking FS operations */
-       struct mutex write_inode;               /* mutex for write inode */
+       struct mutex cp_mutex;                  /* checkpoint procedure lock */
+       struct mutex fs_lock[NR_GLOBAL_LOCKS];  /* blocking FS operations */
+       struct mutex node_write;                /* locking node writes */
        struct mutex writepages;                /* mutex for writepages() */
+       unsigned char next_lock_num;            /* round-robin global locks */
        int por_doing;                          /* recovery is doing or not */
+       int on_build_free_nids;                 /* build_free_nids is in progress */
 
        /* for orphan inode management */
        struct list_head orphan_inode_list;     /* orphan inode list */
@@ -406,6 +400,7 @@ struct f2fs_sb_info {
        /* for cleaning operations */
        struct mutex gc_mutex;                  /* mutex for GC */
        struct f2fs_gc_kthread  *gc_thread;     /* GC thread */
+       unsigned int cur_victim_sec;            /* current victim section num */
 
        /*
         * for stat information.
@@ -498,22 +493,51 @@ static inline void clear_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f)
        cp->ckpt_flags = cpu_to_le32(ckpt_flags);
 }
 
-static inline void mutex_lock_op(struct f2fs_sb_info *sbi, enum lock_type t)
+static inline void mutex_lock_all(struct f2fs_sb_info *sbi)
 {
-       mutex_lock_nested(&sbi->fs_lock[t], t);
+       int i = 0;
+       for (; i < NR_GLOBAL_LOCKS; i++)
+               mutex_lock(&sbi->fs_lock[i]);
 }
 
-static inline void mutex_unlock_op(struct f2fs_sb_info *sbi, enum lock_type t)
+static inline void mutex_unlock_all(struct f2fs_sb_info *sbi)
 {
-       mutex_unlock(&sbi->fs_lock[t]);
+       int i = 0;
+       for (; i < NR_GLOBAL_LOCKS; i++)
+               mutex_unlock(&sbi->fs_lock[i]);
+}
+
+static inline int mutex_lock_op(struct f2fs_sb_info *sbi)
+{
+       unsigned char next_lock = sbi->next_lock_num % NR_GLOBAL_LOCKS;
+       int i = 0;
+
+       for (; i < NR_GLOBAL_LOCKS; i++)
+               if (mutex_trylock(&sbi->fs_lock[i]))
+                       return i;
+
+       mutex_lock(&sbi->fs_lock[next_lock]);
+       sbi->next_lock_num++;
+       return next_lock;
+}
+
+static inline void mutex_unlock_op(struct f2fs_sb_info *sbi, int ilock)
+{
+       if (ilock < 0)
+               return;
+       BUG_ON(ilock >= NR_GLOBAL_LOCKS);
+       mutex_unlock(&sbi->fs_lock[ilock]);
 }
 
 /*
  * Check whether the given nid is within node id range.
  */
-static inline void check_nid_range(struct f2fs_sb_info *sbi, nid_t nid)
+static inline int check_nid_range(struct f2fs_sb_info *sbi, nid_t nid)
 {
-       BUG_ON((nid >= NM_I(sbi)->max_nid));
+       WARN_ON((nid >= NM_I(sbi)->max_nid));
+       if (nid >= NM_I(sbi)->max_nid)
+               return -EINVAL;
+       return 0;
 }
 
 #define F2FS_DEFAULT_ALLOCATED_BLOCKS  1
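The per-type fs_lock[NR_LOCK_TYPE] mutexes and their nesting order are replaced by NR_GLOBAL_LOCKS interchangeable locks. The caller pattern used throughout the rest of this patch looks like this (sketch only; the checkpoint-side caller of mutex_lock_all() is not part of the hunks shown here):

        int ilock = mutex_lock_op(sbi);         /* take any free slot, else wait round-robin */
        /* ... one metadata-mutating operation: allocation, truncate, link, ... */
        mutex_unlock_op(sbi, ilock);            /* ilock < 0 is ignored, so callers may
                                                   initialise it to -1 and unlock unconditionally */

        /* the checkpoint procedure excludes every in-flight operation at once */
        mutex_lock_all(sbi);
        /* ... write the checkpoint ... */
        mutex_unlock_all(sbi);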
@@ -819,7 +843,6 @@ static inline int f2fs_clear_bit(unsigned int nr, char *addr)
 /* used for f2fs_inode_info->flags */
 enum {
        FI_NEW_INODE,           /* indicate newly allocated inode */
-       FI_NEED_CP,             /* need to do checkpoint during fsync */
        FI_INC_LINK,            /* need to increment i_nlink */
        FI_ACL_MODE,            /* indicate acl mode */
        FI_NO_ALLOC,            /* should not allocate any blocks */
@@ -872,6 +895,7 @@ long f2fs_compat_ioctl(struct file *, unsigned int, unsigned long);
 void f2fs_set_inode_flags(struct inode *);
 struct inode *f2fs_iget(struct super_block *, unsigned long);
 void update_inode(struct inode *, struct page *);
+int update_inode_page(struct inode *);
 int f2fs_write_inode(struct inode *, struct writeback_control *);
 void f2fs_evict_inode(struct inode *);
 
@@ -973,7 +997,6 @@ int lookup_journal_in_cursum(struct f2fs_summary_block *,
                                        int, unsigned int, int);
 void flush_sit_entries(struct f2fs_sb_info *);
 int build_segment_manager(struct f2fs_sb_info *);
-void reset_victim_segmap(struct f2fs_sb_info *);
 void destroy_segment_manager(struct f2fs_sb_info *);
 
 /*
@@ -1000,7 +1023,7 @@ void destroy_checkpoint_caches(void);
  */
 int reserve_new_block(struct dnode_of_data *);
 void update_extent_cache(block_t, struct dnode_of_data *);
-struct page *find_data_page(struct inode *, pgoff_t);
+struct page *find_data_page(struct inode *, pgoff_t, bool);
 struct page *get_lock_data_page(struct inode *, pgoff_t);
 struct page *get_new_data_page(struct inode *, pgoff_t, bool);
 int f2fs_readpage(struct f2fs_sb_info *, struct page *, block_t, int);
@@ -1020,7 +1043,7 @@ void destroy_gc_caches(void);
 /*
  * recovery.c
  */
-void recover_fsync_data(struct f2fs_sb_info *);
+int recover_fsync_data(struct f2fs_sb_info *);
 bool space_for_roll_forward(struct f2fs_sb_info *);
 
 /*
index db626282d424e49fff3669a57f1d29dee1b08407..1cae864f8dfcd03c676a8e02b1b2261a94af4f21 100644 (file)
@@ -13,6 +13,7 @@
 #include <linux/stat.h>
 #include <linux/buffer_head.h>
 #include <linux/writeback.h>
+#include <linux/blkdev.h>
 #include <linux/falloc.h>
 #include <linux/types.h>
 #include <linux/compat.h>
@@ -24,6 +25,7 @@
 #include "segment.h"
 #include "xattr.h"
 #include "acl.h"
+#include <trace/events/f2fs.h>
 
 static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma,
                                                struct vm_fault *vmf)
@@ -33,19 +35,18 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma,
        struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
        block_t old_blk_addr;
        struct dnode_of_data dn;
-       int err;
+       int err, ilock;
 
        f2fs_balance_fs(sbi);
 
        sb_start_pagefault(inode->i_sb);
 
-       mutex_lock_op(sbi, DATA_NEW);
-
        /* block allocation */
+       ilock = mutex_lock_op(sbi);
        set_new_dnode(&dn, inode, NULL, NULL, 0);
-       err = get_dnode_of_data(&dn, page->index, 0);
+       err = get_dnode_of_data(&dn, page->index, ALLOC_NODE);
        if (err) {
-               mutex_unlock_op(sbi, DATA_NEW);
+               mutex_unlock_op(sbi, ilock);
                goto out;
        }
 
@@ -55,13 +56,12 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma,
                err = reserve_new_block(&dn);
                if (err) {
                        f2fs_put_dnode(&dn);
-                       mutex_unlock_op(sbi, DATA_NEW);
+                       mutex_unlock_op(sbi, ilock);
                        goto out;
                }
        }
        f2fs_put_dnode(&dn);
-
-       mutex_unlock_op(sbi, DATA_NEW);
+       mutex_unlock_op(sbi, ilock);
 
        lock_page(page);
        if (page->mapping != inode->i_mapping ||
@@ -102,28 +102,10 @@ static const struct vm_operations_struct f2fs_file_vm_ops = {
        .remap_pages    = generic_file_remap_pages,
 };
 
-static int need_to_sync_dir(struct f2fs_sb_info *sbi, struct inode *inode)
-{
-       struct dentry *dentry;
-       nid_t pino;
-
-       inode = igrab(inode);
-       dentry = d_find_any_alias(inode);
-       if (!dentry) {
-               iput(inode);
-               return 0;
-       }
-       pino = dentry->d_parent->d_inode->i_ino;
-       dput(dentry);
-       iput(inode);
-       return !is_checkpointed_node(sbi, pino);
-}
-
 int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 {
        struct inode *inode = file->f_mapping->host;
        struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
-       unsigned long long cur_version;
        int ret = 0;
        bool need_cp = false;
        struct writeback_control wbc = {
@@ -135,9 +117,12 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
        if (inode->i_sb->s_flags & MS_RDONLY)
                return 0;
 
+       trace_f2fs_sync_file_enter(inode);
        ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
-       if (ret)
+       if (ret) {
+               trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret);
                return ret;
+       }
 
        /* guarantee free sections for fsync */
        f2fs_balance_fs(sbi);
@@ -147,28 +132,18 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
        if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
                goto out;
 
-       mutex_lock(&sbi->cp_mutex);
-       cur_version = le64_to_cpu(F2FS_CKPT(sbi)->checkpoint_ver);
-       mutex_unlock(&sbi->cp_mutex);
-
-       if (F2FS_I(inode)->data_version != cur_version &&
-                                       !(inode->i_state & I_DIRTY))
-               goto out;
-       F2FS_I(inode)->data_version--;
-
        if (!S_ISREG(inode->i_mode) || inode->i_nlink != 1)
                need_cp = true;
-       else if (is_inode_flag_set(F2FS_I(inode), FI_NEED_CP))
+       else if (is_cp_file(inode))
                need_cp = true;
        else if (!space_for_roll_forward(sbi))
                need_cp = true;
-       else if (need_to_sync_dir(sbi, inode))
+       else if (!is_checkpointed_node(sbi, F2FS_I(inode)->i_pino))
                need_cp = true;
 
        if (need_cp) {
                /* all the dirty node pages should be flushed for POR */
                ret = f2fs_sync_fs(inode->i_sb, 1);
-               clear_inode_flag(F2FS_I(inode), FI_NEED_CP);
        } else {
                /* if there is no written node page, write its inode page */
                while (!sync_node_pages(sbi, inode->i_ino, &wbc)) {
@@ -178,9 +153,11 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
                }
                filemap_fdatawait_range(sbi->node_inode->i_mapping,
                                                        0, LONG_MAX);
+               ret = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
        }
 out:
        mutex_unlock(&inode->i_mutex);
+       trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret);
        return ret;
 }
 
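With data_version tracking gone, the fsync path derives need_cp only from the conditions visible above; in condensed form:

        need_cp = !S_ISREG(inode->i_mode) || inode->i_nlink != 1 ||
                  is_cp_file(inode) ||                  /* set e.g. by f2fs_link() below */
                  !space_for_roll_forward(sbi) ||
                  !is_checkpointed_node(sbi, F2FS_I(inode)->i_pino);

        if (need_cp)
                ret = f2fs_sync_fs(inode->i_sb, 1);     /* full checkpoint */
        else
                /* roll-forward path: sync the node pages, then flush the device cache */
                ret = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);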
@@ -216,6 +193,9 @@ static int truncate_data_blocks_range(struct dnode_of_data *dn, int count)
                sync_inode_page(dn);
        }
        dn->ofs_in_node = ofs;
+
+       trace_f2fs_truncate_data_blocks_range(dn->inode, dn->nid,
+                                        dn->ofs_in_node, nr_free);
        return nr_free;
 }
 
@@ -232,11 +212,15 @@ static void truncate_partial_data_page(struct inode *inode, u64 from)
        if (!offset)
                return;
 
-       page = find_data_page(inode, from >> PAGE_CACHE_SHIFT);
+       page = find_data_page(inode, from >> PAGE_CACHE_SHIFT, false);
        if (IS_ERR(page))
                return;
 
        lock_page(page);
+       if (page->mapping != inode->i_mapping) {
+               f2fs_put_page(page, 1);
+               return;
+       }
        wait_on_page_writeback(page);
        zero_user(page, offset, PAGE_CACHE_SIZE - offset);
        set_page_dirty(page);
@@ -249,20 +233,22 @@ static int truncate_blocks(struct inode *inode, u64 from)
        unsigned int blocksize = inode->i_sb->s_blocksize;
        struct dnode_of_data dn;
        pgoff_t free_from;
-       int count = 0;
+       int count = 0, ilock = -1;
        int err;
 
+       trace_f2fs_truncate_blocks_enter(inode, from);
+
        free_from = (pgoff_t)
                        ((from + blocksize - 1) >> (sbi->log_blocksize));
 
-       mutex_lock_op(sbi, DATA_TRUNC);
-
+       ilock = mutex_lock_op(sbi);
        set_new_dnode(&dn, inode, NULL, NULL, 0);
-       err = get_dnode_of_data(&dn, free_from, RDONLY_NODE);
+       err = get_dnode_of_data(&dn, free_from, LOOKUP_NODE);
        if (err) {
                if (err == -ENOENT)
                        goto free_next;
-               mutex_unlock_op(sbi, DATA_TRUNC);
+               mutex_unlock_op(sbi, ilock);
+               trace_f2fs_truncate_blocks_exit(inode, err);
                return err;
        }
 
@@ -273,6 +259,7 @@ static int truncate_blocks(struct inode *inode, u64 from)
 
        count -= dn.ofs_in_node;
        BUG_ON(count < 0);
+
        if (dn.ofs_in_node || IS_INODE(dn.node_page)) {
                truncate_data_blocks_range(&dn, count);
                free_from += count;
@@ -281,11 +268,12 @@ static int truncate_blocks(struct inode *inode, u64 from)
        f2fs_put_dnode(&dn);
 free_next:
        err = truncate_inode_blocks(inode, free_from);
-       mutex_unlock_op(sbi, DATA_TRUNC);
+       mutex_unlock_op(sbi, ilock);
 
        /* lastly zero out the first data page */
        truncate_partial_data_page(inode, from);
 
+       trace_f2fs_truncate_blocks_exit(inode, err);
        return err;
 }
 
@@ -295,6 +283,8 @@ void f2fs_truncate(struct inode *inode)
                                S_ISLNK(inode->i_mode)))
                return;
 
+       trace_f2fs_truncate(inode);
+
        if (!truncate_blocks(inode, i_size_read(inode))) {
                inode->i_mtime = inode->i_ctime = CURRENT_TIME;
                mark_inode_dirty(inode);
@@ -389,15 +379,16 @@ static void fill_zero(struct inode *inode, pgoff_t index,
 {
        struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
        struct page *page;
+       int ilock;
 
        if (!len)
                return;
 
        f2fs_balance_fs(sbi);
 
-       mutex_lock_op(sbi, DATA_NEW);
+       ilock = mutex_lock_op(sbi);
        page = get_new_data_page(inode, index, false);
-       mutex_unlock_op(sbi, DATA_NEW);
+       mutex_unlock_op(sbi, ilock);
 
        if (!IS_ERR(page)) {
                wait_on_page_writeback(page);
@@ -414,15 +405,10 @@ int truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end)
 
        for (index = pg_start; index < pg_end; index++) {
                struct dnode_of_data dn;
-               struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
-
-               f2fs_balance_fs(sbi);
 
-               mutex_lock_op(sbi, DATA_TRUNC);
                set_new_dnode(&dn, inode, NULL, NULL, 0);
-               err = get_dnode_of_data(&dn, index, RDONLY_NODE);
+               err = get_dnode_of_data(&dn, index, LOOKUP_NODE);
                if (err) {
-                       mutex_unlock_op(sbi, DATA_TRUNC);
                        if (err == -ENOENT)
                                continue;
                        return err;
@@ -431,7 +417,6 @@ int truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end)
                if (dn.data_blkaddr != NULL_ADDR)
                        truncate_data_blocks_range(&dn, 1);
                f2fs_put_dnode(&dn);
-               mutex_unlock_op(sbi, DATA_TRUNC);
        }
        return 0;
 }
@@ -461,12 +446,19 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len, int mode)
                if (pg_start < pg_end) {
                        struct address_space *mapping = inode->i_mapping;
                        loff_t blk_start, blk_end;
+                       struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+                       int ilock;
+
+                       f2fs_balance_fs(sbi);
 
                        blk_start = pg_start << PAGE_CACHE_SHIFT;
                        blk_end = pg_end << PAGE_CACHE_SHIFT;
                        truncate_inode_pages_range(mapping, blk_start,
                                        blk_end - 1);
+
+                       ilock = mutex_lock_op(sbi);
                        ret = truncate_hole(inode, pg_start, pg_end);
+                       mutex_unlock_op(sbi, ilock);
                }
        }
 
@@ -500,13 +492,13 @@ static int expand_inode_data(struct inode *inode, loff_t offset,
 
        for (index = pg_start; index <= pg_end; index++) {
                struct dnode_of_data dn;
+               int ilock;
 
-               mutex_lock_op(sbi, DATA_NEW);
-
+               ilock = mutex_lock_op(sbi);
                set_new_dnode(&dn, inode, NULL, NULL, 0);
-               ret = get_dnode_of_data(&dn, index, 0);
+               ret = get_dnode_of_data(&dn, index, ALLOC_NODE);
                if (ret) {
-                       mutex_unlock_op(sbi, DATA_NEW);
+                       mutex_unlock_op(sbi, ilock);
                        break;
                }
 
@@ -514,13 +506,12 @@ static int expand_inode_data(struct inode *inode, loff_t offset,
                        ret = reserve_new_block(&dn);
                        if (ret) {
                                f2fs_put_dnode(&dn);
-                               mutex_unlock_op(sbi, DATA_NEW);
+                               mutex_unlock_op(sbi, ilock);
                                break;
                        }
                }
                f2fs_put_dnode(&dn);
-
-               mutex_unlock_op(sbi, DATA_NEW);
+               mutex_unlock_op(sbi, ilock);
 
                if (pg_start == pg_end)
                        new_size = offset + len;
@@ -559,6 +550,7 @@ static long f2fs_fallocate(struct file *file, int mode,
                inode->i_mtime = inode->i_ctime = CURRENT_TIME;
                mark_inode_dirty(inode);
        }
+       trace_f2fs_fallocate(inode, mode, offset, len, ret);
        return ret;
 }
 
index 2e3eb2d4fc30eebcd40c1b34283302461dfea3f3..14961593e93c84f47860e0a33df001f59a8d9508 100644 (file)
@@ -11,7 +11,6 @@
 #include <linux/fs.h>
 #include <linux/module.h>
 #include <linux/backing-dev.h>
-#include <linux/proc_fs.h>
 #include <linux/init.h>
 #include <linux/f2fs_fs.h>
 #include <linux/kthread.h>
@@ -23,6 +22,7 @@
 #include "node.h"
 #include "segment.h"
 #include "gc.h"
+#include <trace/events/f2fs.h>
 
 static struct kmem_cache *winode_slab;
 
@@ -81,9 +81,6 @@ static int gc_thread_func(void *data)
                /* if return value is not zero, no victim was selected */
                if (f2fs_gc(sbi))
                        wait_ms = GC_THREAD_NOGC_SLEEP_TIME;
-               else if (wait_ms == GC_THREAD_NOGC_SLEEP_TIME)
-                       wait_ms = GC_THREAD_MAX_SLEEP_TIME;
-
        } while (!kthread_should_stop());
        return 0;
 }
@@ -131,7 +128,7 @@ static void select_policy(struct f2fs_sb_info *sbi, int gc_type,
 {
        struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
 
-       if (p->alloc_mode) {
+       if (p->alloc_mode == SSR) {
                p->gc_mode = GC_GREEDY;
                p->dirty_segmap = dirty_i->dirty_segmap[type];
                p->ofs_unit = 1;
@@ -160,18 +157,21 @@ static unsigned int get_max_cost(struct f2fs_sb_info *sbi,
 static unsigned int check_bg_victims(struct f2fs_sb_info *sbi)
 {
        struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
-       unsigned int segno;
+       unsigned int hint = 0;
+       unsigned int secno;
 
        /*
         * If the gc_type is FG_GC, we can reuse the victim segments that
         * were selected earlier by background GC.
         * Those segments are guaranteed to have few valid blocks.
         */
-       segno = find_next_bit(dirty_i->victim_segmap[BG_GC],
-                                               TOTAL_SEGS(sbi), 0);
-       if (segno < TOTAL_SEGS(sbi)) {
-               clear_bit(segno, dirty_i->victim_segmap[BG_GC]);
-               return segno;
+next:
+       secno = find_next_bit(dirty_i->victim_secmap, TOTAL_SECS(sbi), hint++);
+       if (secno < TOTAL_SECS(sbi)) {
+               if (sec_usage_check(sbi, secno))
+                       goto next;
+               clear_bit(secno, dirty_i->victim_secmap);
+               return secno * sbi->segs_per_sec;
        }
        return NULL_SEGNO;
 }
@@ -234,7 +234,7 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
 {
        struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
        struct victim_sel_policy p;
-       unsigned int segno;
+       unsigned int secno;
        int nsearched = 0;
 
        p.alloc_mode = alloc_mode;
@@ -253,6 +253,7 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
 
        while (1) {
                unsigned long cost;
+               unsigned int segno;
 
                segno = find_next_bit(p.dirty_segmap,
                                                TOTAL_SEGS(sbi), p.offset);
@@ -265,13 +266,11 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
                        break;
                }
                p.offset = ((segno / p.ofs_unit) * p.ofs_unit) + p.ofs_unit;
+               secno = GET_SECNO(sbi, segno);
 
-               if (test_bit(segno, dirty_i->victim_segmap[FG_GC]))
-                       continue;
-               if (gc_type == BG_GC &&
-                               test_bit(segno, dirty_i->victim_segmap[BG_GC]))
+               if (sec_usage_check(sbi, secno))
                        continue;
-               if (IS_CURSEC(sbi, GET_SECNO(sbi, segno)))
+               if (gc_type == BG_GC && test_bit(secno, dirty_i->victim_secmap))
                        continue;
 
                cost = get_gc_cost(sbi, segno, &p);
@@ -291,13 +290,18 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
        }
 got_it:
        if (p.min_segno != NULL_SEGNO) {
-               *result = (p.min_segno / p.ofs_unit) * p.ofs_unit;
                if (p.alloc_mode == LFS) {
-                       int i;
-                       for (i = 0; i < p.ofs_unit; i++)
-                               set_bit(*result + i,
-                                       dirty_i->victim_segmap[gc_type]);
+                       secno = GET_SECNO(sbi, p.min_segno);
+                       if (gc_type == FG_GC)
+                               sbi->cur_victim_sec = secno;
+                       else
+                               set_bit(secno, dirty_i->victim_secmap);
                }
+               *result = (p.min_segno / p.ofs_unit) * p.ofs_unit;
+
+               trace_f2fs_get_victim(sbi->sb, type, gc_type, &p,
+                               sbi->cur_victim_sec,
+                               prefree_segments(sbi), free_segments(sbi));
        }
        mutex_unlock(&dirty_i->seglist_lock);
 
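Victim bookkeeping moves from the per-segment victim_segmap[FG_GC]/[BG_GC] bitmaps to a single per-section victim_secmap plus sbi->cur_victim_sec for the section currently being reclaimed by foreground GC. Both check_bg_victims() and get_victim_by_default() now filter candidates through sec_usage_check(), which is not defined in the hunks shown here and presumably rejects sections already in use. The selection result is recorded as in the got_it hunk above:

        secno = GET_SECNO(sbi, p.min_segno);
        if (gc_type == FG_GC)
                sbi->cur_victim_sec = secno;            /* reset to NULL_SEGNO in f2fs_gc() when done */
        else
                set_bit(secno, dirty_i->victim_secmap); /* remembered for a later FG_GC pass */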
@@ -381,6 +385,7 @@ static void gc_node_segment(struct f2fs_sb_info *sbi,
 
 next_step:
        entry = sum;
+
        for (off = 0; off < sbi->blocks_per_seg; off++, entry++) {
                nid_t nid = le32_to_cpu(entry->nid);
                struct page *node_page;
@@ -401,11 +406,18 @@ next_step:
                        continue;
 
                /* set page dirty and write it */
-               if (!PageWriteback(node_page))
+               if (gc_type == FG_GC) {
+                       f2fs_submit_bio(sbi, NODE, true);
+                       wait_on_page_writeback(node_page);
                        set_page_dirty(node_page);
+               } else {
+                       if (!PageWriteback(node_page))
+                               set_page_dirty(node_page);
+               }
                f2fs_put_page(node_page, 1);
                stat_inc_node_blk_count(sbi, 1);
        }
+
        if (initial) {
                initial = false;
                goto next_step;
@@ -418,6 +430,13 @@ next_step:
                        .for_reclaim = 0,
                };
                sync_node_pages(sbi, 0, &wbc);
+
+               /*
+                * In the case of FG_GC, it'd be better to reclaim this victim
+                * completely.
+                */
+               if (get_valid_blocks(sbi, segno, 1) != 0)
+                       goto next_step;
        }
 }
 
@@ -481,21 +500,19 @@ static int check_dnode(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
 
 static void move_data_page(struct inode *inode, struct page *page, int gc_type)
 {
-       if (page->mapping != inode->i_mapping)
-               goto out;
-
-       if (inode != page->mapping->host)
-               goto out;
-
-       if (PageWriteback(page))
-               goto out;
-
        if (gc_type == BG_GC) {
+               if (PageWriteback(page))
+                       goto out;
                set_page_dirty(page);
                set_cold_data(page);
        } else {
                struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
-               mutex_lock_op(sbi, DATA_WRITE);
+
+               if (PageWriteback(page)) {
+                       f2fs_submit_bio(sbi, DATA, true);
+                       wait_on_page_writeback(page);
+               }
+
                if (clear_page_dirty_for_io(page) &&
                        S_ISDIR(inode->i_mode)) {
                        dec_page_count(sbi, F2FS_DIRTY_DENTS);
@@ -503,7 +520,6 @@ static void move_data_page(struct inode *inode, struct page *page, int gc_type)
                }
                set_cold_data(page);
                do_write_data_page(page);
-               mutex_unlock_op(sbi, DATA_WRITE);
                clear_cold_data(page);
        }
 out:
@@ -530,6 +546,7 @@ static void gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
 
 next_step:
        entry = sum;
+
        for (off = 0; off < sbi->blocks_per_seg; off++, entry++) {
                struct page *data_page;
                struct inode *inode;
@@ -567,7 +584,7 @@ next_step:
                                continue;
 
                        data_page = find_data_page(inode,
-                                       start_bidx + ofs_in_node);
+                                       start_bidx + ofs_in_node, false);
                        if (IS_ERR(data_page))
                                goto next_iput;
 
@@ -588,11 +605,22 @@ next_step:
 next_iput:
                iput(inode);
        }
+
        if (++phase < 4)
                goto next_step;
 
-       if (gc_type == FG_GC)
+       if (gc_type == FG_GC) {
                f2fs_submit_bio(sbi, DATA, true);
+
+               /*
+                * In the case of FG_GC, it'd be better to reclaim this victim
+                * completely.
+                */
+               if (get_valid_blocks(sbi, segno, 1) != 0) {
+                       phase = 2;
+                       goto next_step;
+               }
+       }
 }
 
 static int __get_victim(struct f2fs_sb_info *sbi, unsigned int *victim,
@@ -611,18 +639,15 @@ static void do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno,
 {
        struct page *sum_page;
        struct f2fs_summary_block *sum;
+       struct blk_plug plug;
 
        /* read segment summary of victim */
        sum_page = get_sum_page(sbi, segno);
        if (IS_ERR(sum_page))
                return;
 
-       /*
-        * CP needs to lock sum_page. In this time, we don't need
-        * to lock this page, because this summary page is not gone anywhere.
-        * Also, this page is not gonna be updated before GC is done.
-        */
-       unlock_page(sum_page);
+       blk_start_plug(&plug);
+
        sum = page_address(sum_page);
 
        switch (GET_SUM_TYPE((&sum->footer))) {
@@ -633,10 +658,12 @@ static void do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno,
                gc_data_segment(sbi, sum->entries, ilist, segno, gc_type);
                break;
        }
+       blk_finish_plug(&plug);
+
        stat_inc_seg_count(sbi, GET_SUM_TYPE((&sum->footer)));
        stat_inc_call_count(sbi->stat_info);
 
-       f2fs_put_page(sum_page, 0);
+       f2fs_put_page(sum_page, 1);
 }
 
 int f2fs_gc(struct f2fs_sb_info *sbi)
@@ -652,8 +679,10 @@ gc_more:
        if (!(sbi->sb->s_flags & MS_ACTIVE))
                goto stop;
 
-       if (gc_type == BG_GC && has_not_enough_free_secs(sbi, nfree))
+       if (gc_type == BG_GC && has_not_enough_free_secs(sbi, nfree)) {
                gc_type = FG_GC;
+               write_checkpoint(sbi, false);
+       }
 
        if (!__get_victim(sbi, &segno, gc_type, NO_CHECK_TYPE))
                goto stop;
@@ -662,9 +691,11 @@ gc_more:
        for (i = 0; i < sbi->segs_per_sec; i++)
                do_garbage_collect(sbi, segno + i, &ilist, gc_type);
 
-       if (gc_type == FG_GC &&
-                       get_valid_blocks(sbi, segno, sbi->segs_per_sec) == 0)
+       if (gc_type == FG_GC) {
+               sbi->cur_victim_sec = NULL_SEGNO;
                nfree++;
+               WARN_ON(get_valid_blocks(sbi, segno, sbi->segs_per_sec));
+       }
 
        if (has_not_enough_free_secs(sbi, nfree))
                goto gc_more;
index 30b2db003acd46c4ef5c637bfc29a37f5217cf5d..2c6a6bd0832244f4bb1e45b52cd41400e87fe363 100644 (file)
@@ -13,9 +13,9 @@
                                                 * whether IO subsystem is idle
                                                 * or not
                                                 */
-#define GC_THREAD_MIN_SLEEP_TIME       10000 /* milliseconds */
-#define GC_THREAD_MAX_SLEEP_TIME       30000
-#define GC_THREAD_NOGC_SLEEP_TIME      10000
+#define GC_THREAD_MIN_SLEEP_TIME       30000   /* milliseconds */
+#define GC_THREAD_MAX_SLEEP_TIME       60000
+#define GC_THREAD_NOGC_SLEEP_TIME      300000  /* wait 5 min */
 #define LIMIT_INVALID_BLOCK    40 /* percentage over total user space */
 #define LIMIT_FREE_BLOCK       40 /* percentage over invalid + free space */
 
@@ -58,6 +58,9 @@ static inline block_t limit_free_user_blocks(struct f2fs_sb_info *sbi)
 
 static inline long increase_sleep_time(long wait)
 {
+       if (wait == GC_THREAD_NOGC_SLEEP_TIME)
+               return wait;
+
        wait += GC_THREAD_MIN_SLEEP_TIME;
        if (wait > GC_THREAD_MAX_SLEEP_TIME)
                wait = GC_THREAD_MAX_SLEEP_TIME;
@@ -66,6 +69,9 @@ static inline long increase_sleep_time(long wait)
 
 static inline long decrease_sleep_time(long wait)
 {
+       if (wait == GC_THREAD_NOGC_SLEEP_TIME)
+               wait = GC_THREAD_MAX_SLEEP_TIME;
+
        wait -= GC_THREAD_MIN_SLEEP_TIME;
        if (wait <= GC_THREAD_MIN_SLEEP_TIME)
                wait = GC_THREAD_MIN_SLEEP_TIME;
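With the new constants the GC thread backs off far more aggressively: 30 s minimum, 60 s maximum, and 5 minutes once a pass finds no victim. Assuming increase_sleep_time() is applied while the device stays idle and decrease_sleep_time() once activity is seen (the caller, gc_thread_func(), is only partially visible in this patch), the wait time in milliseconds evolves as:

        30000  --idle-->             60000   (MIN + MIN, capped at MAX)
        60000  --idle-->             60000   (already at MAX)
        60000  --no victim found-->  300000  (NOGC, set by gc_thread_func())
        300000 --still idle-->       300000  (increase_sleep_time() leaves NOGC untouched)
        300000 --activity-->         30000   (decrease resets NOGC to MAX, then subtracts MIN)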
index ddae412d30c8519d7b11c4fe396f548b3a7b713a..91ac7f9d88eeaf84a86f38bb07866095910d8620 100644 (file)
@@ -16,6 +16,8 @@
 #include "f2fs.h"
 #include "node.h"
 
+#include <trace/events/f2fs.h>
+
 void f2fs_set_inode_flags(struct inode *inode)
 {
        unsigned int flags = F2FS_I(inode)->i_flags;
@@ -44,7 +46,11 @@ static int do_read_inode(struct inode *inode)
        struct f2fs_inode *ri;
 
        /* Check if ino is within scope */
-       check_nid_range(sbi, inode->i_ino);
+       if (check_nid_range(sbi, inode->i_ino)) {
+               f2fs_msg(inode->i_sb, KERN_ERR, "bad inode number: %lu",
+                        (unsigned long) inode->i_ino);
+               return -EINVAL;
+       }
 
        node_page = get_node_page(sbi, inode->i_ino);
        if (IS_ERR(node_page))
@@ -76,7 +82,6 @@ static int do_read_inode(struct inode *inode)
        fi->i_xattr_nid = le32_to_cpu(ri->i_xattr_nid);
        fi->i_flags = le32_to_cpu(ri->i_flags);
        fi->flags = 0;
-       fi->data_version = le64_to_cpu(F2FS_CKPT(sbi)->checkpoint_ver) - 1;
        fi->i_advise = ri->i_advise;
        fi->i_pino = le32_to_cpu(ri->i_pino);
        get_extent_info(&fi->ext, ri->i_ext);
@@ -88,13 +93,16 @@ struct inode *f2fs_iget(struct super_block *sb, unsigned long ino)
 {
        struct f2fs_sb_info *sbi = F2FS_SB(sb);
        struct inode *inode;
-       int ret;
+       int ret = 0;
 
        inode = iget_locked(sb, ino);
        if (!inode)
                return ERR_PTR(-ENOMEM);
-       if (!(inode->i_state & I_NEW))
+
+       if (!(inode->i_state & I_NEW)) {
+               trace_f2fs_iget(inode);
                return inode;
+       }
        if (ino == F2FS_NODE_INO(sbi) || ino == F2FS_META_INO(sbi))
                goto make_now;
 
@@ -136,11 +144,12 @@ make_now:
                goto bad_inode;
        }
        unlock_new_inode(inode);
-
+       trace_f2fs_iget(inode);
        return inode;
 
 bad_inode:
        iget_failed(inode);
+       trace_f2fs_iget_exit(inode, ret);
        return ERR_PTR(ret);
 }
 
@@ -192,47 +201,51 @@ void update_inode(struct inode *inode, struct page *node_page)
        set_page_dirty(node_page);
 }
 
-int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc)
+int update_inode_page(struct inode *inode)
 {
        struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
        struct page *node_page;
-       bool need_lock = false;
-
-       if (inode->i_ino == F2FS_NODE_INO(sbi) ||
-                       inode->i_ino == F2FS_META_INO(sbi))
-               return 0;
-
-       if (wbc)
-               f2fs_balance_fs(sbi);
 
        node_page = get_node_page(sbi, inode->i_ino);
        if (IS_ERR(node_page))
                return PTR_ERR(node_page);
 
-       if (!PageDirty(node_page)) {
-               need_lock = true;
-               f2fs_put_page(node_page, 1);
-               mutex_lock(&sbi->write_inode);
-               node_page = get_node_page(sbi, inode->i_ino);
-               if (IS_ERR(node_page)) {
-                       mutex_unlock(&sbi->write_inode);
-                       return PTR_ERR(node_page);
-               }
-       }
        update_inode(inode, node_page);
        f2fs_put_page(node_page, 1);
-       if (need_lock)
-               mutex_unlock(&sbi->write_inode);
        return 0;
 }
 
+int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc)
+{
+       struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+       int ret, ilock;
+
+       if (inode->i_ino == F2FS_NODE_INO(sbi) ||
+                       inode->i_ino == F2FS_META_INO(sbi))
+               return 0;
+
+       if (wbc)
+               f2fs_balance_fs(sbi);
+
+       /*
+        * We need to lock here to prevent producing dirty node pages
+        * during urgent cleaning when running out of free sections.
+        */
+       ilock = mutex_lock_op(sbi);
+       ret = update_inode_page(inode);
+       mutex_unlock_op(sbi, ilock);
+       return ret;
+}
+
 /*
  * Called at the last iput() if i_nlink is zero
  */
 void f2fs_evict_inode(struct inode *inode)
 {
        struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+       int ilock;
 
+       trace_f2fs_evict_inode(inode);
        truncate_inode_pages(&inode->i_data, 0);
 
        if (inode->i_ino == F2FS_NODE_INO(sbi) ||
@@ -252,7 +265,10 @@ void f2fs_evict_inode(struct inode *inode)
        if (F2FS_HAS_BLOCKS(inode))
                f2fs_truncate(inode);
 
+       ilock = mutex_lock_op(sbi);
        remove_inode_page(inode);
+       mutex_unlock_op(sbi, ilock);
+
        sb_end_intwrite(inode->i_sb);
 no_delete:
        clear_inode(inode);
index 1a49b881bac021cefa53df578108a2a6972c3961..47abc9722b17abfae9656b3d5d52360fc8e72ad8 100644 (file)
 #include <linux/ctype.h>
 
 #include "f2fs.h"
+#include "node.h"
 #include "xattr.h"
 #include "acl.h"
+#include <trace/events/f2fs.h>
 
 static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode)
 {
@@ -25,19 +27,19 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode)
        nid_t ino;
        struct inode *inode;
        bool nid_free = false;
-       int err;
+       int err, ilock;
 
        inode = new_inode(sb);
        if (!inode)
                return ERR_PTR(-ENOMEM);
 
-       mutex_lock_op(sbi, NODE_NEW);
+       ilock = mutex_lock_op(sbi);
        if (!alloc_nid(sbi, &ino)) {
-               mutex_unlock_op(sbi, NODE_NEW);
+               mutex_unlock_op(sbi, ilock);
                err = -ENOSPC;
                goto fail;
        }
-       mutex_unlock_op(sbi, NODE_NEW);
+       mutex_unlock_op(sbi, ilock);
 
        inode->i_uid = current_fsuid();
 
@@ -61,7 +63,7 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode)
                nid_free = true;
                goto out;
        }
-
+       trace_f2fs_new_inode(inode, 0);
        mark_inode_dirty(inode);
        return inode;
 
@@ -69,6 +71,8 @@ out:
        clear_nlink(inode);
        unlock_new_inode(inode);
 fail:
+       trace_f2fs_new_inode(inode, err);
+       make_bad_inode(inode);
        iput(inode);
        if (nid_free)
                alloc_nid_failed(sbi, ino);
@@ -82,7 +86,7 @@ static int is_multimedia_file(const unsigned char *s, const char *sub)
        int ret;
 
        if (sublen > slen)
-               return 1;
+               return 0;
 
        ret = memcmp(s + slen - sublen, sub, sublen);
        if (ret) {      /* compare upper case */
@@ -90,16 +94,16 @@ static int is_multimedia_file(const unsigned char *s, const char *sub)
                char upper_sub[8];
                for (i = 0; i < sublen && i < sizeof(upper_sub); i++)
                        upper_sub[i] = toupper(sub[i]);
-               return memcmp(s + slen - sublen, upper_sub, sublen);
+               return !memcmp(s + slen - sublen, upper_sub, sublen);
        }
 
-       return ret;
+       return !ret;
 }
 
 /*
  * Set multimedia files as cold files for hot/cold data separation
  */
-static inline void set_cold_file(struct f2fs_sb_info *sbi, struct inode *inode,
+static inline void set_cold_files(struct f2fs_sb_info *sbi, struct inode *inode,
                const unsigned char *name)
 {
        int i;
@@ -107,8 +111,8 @@ static inline void set_cold_file(struct f2fs_sb_info *sbi, struct inode *inode,
 
        int count = le32_to_cpu(sbi->raw_super->extension_count);
        for (i = 0; i < count; i++) {
-               if (!is_multimedia_file(name, extlist[i])) {
-                       F2FS_I(inode)->i_advise |= FADVISE_COLD_BIT;
+               if (is_multimedia_file(name, extlist[i])) {
+                       set_cold_file(inode);
                        break;
                }
        }
@@ -121,7 +125,7 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
        struct f2fs_sb_info *sbi = F2FS_SB(sb);
        struct inode *inode;
        nid_t ino = 0;
-       int err;
+       int err, ilock;
 
        f2fs_balance_fs(sbi);
 
@@ -130,14 +134,16 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
                return PTR_ERR(inode);
 
        if (!test_opt(sbi, DISABLE_EXT_IDENTIFY))
-               set_cold_file(sbi, inode, dentry->d_name.name);
+               set_cold_files(sbi, inode, dentry->d_name.name);
 
        inode->i_op = &f2fs_file_inode_operations;
        inode->i_fop = &f2fs_file_operations;
        inode->i_mapping->a_ops = &f2fs_dblock_aops;
        ino = inode->i_ino;
 
+       ilock = mutex_lock_op(sbi);
        err = f2fs_add_link(dentry, inode);
+       mutex_unlock_op(sbi, ilock);
        if (err)
                goto out;
 
@@ -150,6 +156,7 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
 out:
        clear_nlink(inode);
        unlock_new_inode(inode);
+       make_bad_inode(inode);
        iput(inode);
        alloc_nid_failed(sbi, ino);
        return err;
@@ -161,7 +168,7 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir,
        struct inode *inode = old_dentry->d_inode;
        struct super_block *sb = dir->i_sb;
        struct f2fs_sb_info *sbi = F2FS_SB(sb);
-       int err;
+       int err, ilock;
 
        f2fs_balance_fs(sbi);
 
@@ -169,14 +176,23 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir,
        atomic_inc(&inode->i_count);
 
        set_inode_flag(F2FS_I(inode), FI_INC_LINK);
+       ilock = mutex_lock_op(sbi);
        err = f2fs_add_link(dentry, inode);
+       mutex_unlock_op(sbi, ilock);
        if (err)
                goto out;
 
+       /*
+        * This file should be checkpointed during fsync.
+        * We lost i_pino from now on.
+        */
+       set_cp_file(inode);
+
        d_instantiate(dentry, inode);
        return 0;
 out:
        clear_inode_flag(F2FS_I(inode), FI_INC_LINK);
+       make_bad_inode(inode);
        iput(inode);
        return err;
 }
@@ -197,7 +213,7 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry,
        struct f2fs_dir_entry *de;
        struct page *page;
 
-       if (dentry->d_name.len > F2FS_MAX_NAME_LEN)
+       if (dentry->d_name.len > F2FS_NAME_LEN)
                return ERR_PTR(-ENAMETOOLONG);
 
        de = f2fs_find_entry(dir, &dentry->d_name, &page);
@@ -222,7 +238,9 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry)
        struct f2fs_dir_entry *de;
        struct page *page;
        int err = -ENOENT;
+       int ilock;
 
+       trace_f2fs_unlink_enter(dir, dentry);
        f2fs_balance_fs(sbi);
 
        de = f2fs_find_entry(dir, &dentry->d_name, &page);
@@ -236,11 +254,14 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry)
                goto fail;
        }
 
+       ilock = mutex_lock_op(sbi);
        f2fs_delete_entry(de, page, inode);
+       mutex_unlock_op(sbi, ilock);
 
        /* In order to evict this inode, we set it dirty */
        mark_inode_dirty(inode);
 fail:
+       trace_f2fs_unlink_exit(inode, err);
        return err;
 }
 
@@ -251,7 +272,7 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry,
        struct f2fs_sb_info *sbi = F2FS_SB(sb);
        struct inode *inode;
        size_t symlen = strlen(symname) + 1;
-       int err;
+       int err, ilock;
 
        f2fs_balance_fs(sbi);
 
@@ -262,7 +283,9 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry,
        inode->i_op = &f2fs_symlink_inode_operations;
        inode->i_mapping->a_ops = &f2fs_dblock_aops;
 
+       ilock = mutex_lock_op(sbi);
        err = f2fs_add_link(dentry, inode);
+       mutex_unlock_op(sbi, ilock);
        if (err)
                goto out;
 
@@ -275,6 +298,7 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry,
 out:
        clear_nlink(inode);
        unlock_new_inode(inode);
+       make_bad_inode(inode);
        iput(inode);
        alloc_nid_failed(sbi, inode->i_ino);
        return err;
@@ -284,7 +308,7 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 {
        struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
        struct inode *inode;
-       int err;
+       int err, ilock;
 
        f2fs_balance_fs(sbi);
 
@@ -298,7 +322,9 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
        mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_ZERO);
 
        set_inode_flag(F2FS_I(inode), FI_INC_LINK);
+       ilock = mutex_lock_op(sbi);
        err = f2fs_add_link(dentry, inode);
+       mutex_unlock_op(sbi, ilock);
        if (err)
                goto out_fail;
 
@@ -313,6 +339,7 @@ out_fail:
        clear_inode_flag(F2FS_I(inode), FI_INC_LINK);
        clear_nlink(inode);
        unlock_new_inode(inode);
+       make_bad_inode(inode);
        iput(inode);
        alloc_nid_failed(sbi, inode->i_ino);
        return err;
@@ -333,6 +360,7 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry,
        struct f2fs_sb_info *sbi = F2FS_SB(sb);
        struct inode *inode;
        int err = 0;
+       int ilock;
 
        if (!new_valid_dev(rdev))
                return -EINVAL;
@@ -346,7 +374,9 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry,
        init_special_inode(inode, inode->i_mode, rdev);
        inode->i_op = &f2fs_special_inode_operations;
 
+       ilock = mutex_lock_op(sbi);
        err = f2fs_add_link(dentry, inode);
+       mutex_unlock_op(sbi, ilock);
        if (err)
                goto out;
 
@@ -357,6 +387,7 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry,
 out:
        clear_nlink(inode);
        unlock_new_inode(inode);
+       make_bad_inode(inode);
        iput(inode);
        alloc_nid_failed(sbi, inode->i_ino);
        return err;
@@ -374,7 +405,7 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
        struct f2fs_dir_entry *old_dir_entry = NULL;
        struct f2fs_dir_entry *old_entry;
        struct f2fs_dir_entry *new_entry;
-       int err = -ENOENT;
+       int err = -ENOENT, ilock = -1;
 
        f2fs_balance_fs(sbi);
 
@@ -389,7 +420,7 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
                        goto out_old;
        }
 
-       mutex_lock_op(sbi, RENAME);
+       ilock = mutex_lock_op(sbi);
 
        if (new_inode) {
                struct page *new_page;
@@ -412,7 +443,7 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
                drop_nlink(new_inode);
                if (!new_inode->i_nlink)
                        add_orphan_inode(sbi, new_inode->i_ino);
-               f2fs_write_inode(new_inode, NULL);
+               update_inode_page(new_inode);
        } else {
                err = f2fs_add_link(new_dentry, old_inode);
                if (err)
@@ -420,12 +451,11 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
 
                if (old_dir_entry) {
                        inc_nlink(new_dir);
-                       f2fs_write_inode(new_dir, NULL);
+                       update_inode_page(new_dir);
                }
        }
 
        old_inode->i_ctime = CURRENT_TIME;
-       set_inode_flag(F2FS_I(old_inode), FI_NEED_CP);
        mark_inode_dirty(old_inode);
 
        f2fs_delete_entry(old_entry, old_page, NULL);
@@ -439,10 +469,10 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
                        f2fs_put_page(old_dir_page, 0);
                }
                drop_nlink(old_dir);
-               f2fs_write_inode(old_dir, NULL);
+               update_inode_page(old_dir);
        }
 
-       mutex_unlock_op(sbi, RENAME);
+       mutex_unlock_op(sbi, ilock);
        return 0;
 
 out_dir:
@@ -450,7 +480,7 @@ out_dir:
                kunmap(old_dir_page);
                f2fs_put_page(old_dir_page, 0);
        }
-       mutex_unlock_op(sbi, RENAME);
+       mutex_unlock_op(sbi, ilock);
 out_old:
        kunmap(old_page);
        f2fs_put_page(old_page, 0);
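
Note that is_multimedia_file() above is flipped from a memcmp-style "0 means match" return to a conventional boolean "true means match", with its caller (now set_cold_files()) adjusted accordingly. A hedged user-space sketch of the resulting convention, not the kernel routine itself:

        #include <stdbool.h>
        #include <string.h>
        #include <strings.h>    /* strcasecmp */

        /* true if name ends with ext, compared case-insensitively */
        static bool has_extension(const char *name, const char *ext)
        {
                size_t nlen = strlen(name), elen = strlen(ext);

                if (elen > nlen)
                        return false;   /* extension longer than the name: cannot match */
                return strcasecmp(name + nlen - elen, ext) == 0;
        }

        /* e.g. has_extension("clip.MP4", "mp4") is true, so the caller would
         * mark the inode cold via set_cold_file(). */
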
index e275218904ed964126aaae36598d7f13433f77c3..3df43b4efd89e96e971263e61369dee5d3e39b87 100644 (file)
@@ -19,6 +19,7 @@
 #include "f2fs.h"
 #include "node.h"
 #include "segment.h"
+#include <trace/events/f2fs.h>
 
 static struct kmem_cache *nat_entry_slab;
 static struct kmem_cache *free_nid_slab;
@@ -88,10 +89,13 @@ static void ra_nat_pages(struct f2fs_sb_info *sbi, int nid)
 {
        struct address_space *mapping = sbi->meta_inode->i_mapping;
        struct f2fs_nm_info *nm_i = NM_I(sbi);
+       struct blk_plug plug;
        struct page *page;
        pgoff_t index;
        int i;
 
+       blk_start_plug(&plug);
+
        for (i = 0; i < FREE_NID_PAGES; i++, nid += NAT_ENTRY_PER_BLOCK) {
                if (nid >= nm_i->max_nid)
                        nid = 0;
@@ -100,12 +104,16 @@ static void ra_nat_pages(struct f2fs_sb_info *sbi, int nid)
                page = grab_cache_page(mapping, index);
                if (!page)
                        continue;
-               if (f2fs_readpage(sbi, page, index, READ)) {
+               if (PageUptodate(page)) {
                        f2fs_put_page(page, 1);
                        continue;
                }
+               if (f2fs_readpage(sbi, page, index, READ))
+                       continue;
+
                f2fs_put_page(page, 0);
        }
+       blk_finish_plug(&plug);
 }
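
ra_nat_pages() above now brackets its readahead loop with blk_start_plug()/blk_finish_plug(), so the individual NAT page reads are queued on the task and handed to the block layer as one batch instead of trickling out one request at a time. A minimal sketch of that plugging pattern, with the per-page reader passed in as a callback since the f2fs internals are not repeated here:

        #include <linux/blkdev.h>
        #include <linux/pagemap.h>

        static void ra_pages_batched(struct address_space *mapping, pgoff_t start,
                                     int nr,
                                     void (*read_one)(struct address_space *, pgoff_t))
        {
                struct blk_plug plug;
                int i;

                blk_start_plug(&plug);          /* hold back submission, queue on the task */
                for (i = 0; i < nr; i++)
                        read_one(mapping, start + i);
                blk_finish_plug(&plug);         /* flush everything queued above */
        }
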
 
 static struct nat_entry *__lookup_nat_cache(struct f2fs_nm_info *nm_i, nid_t n)
@@ -236,7 +244,7 @@ static int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink)
 {
        struct f2fs_nm_info *nm_i = NM_I(sbi);
 
-       if (nm_i->nat_cnt < 2 * NM_WOUT_THRESHOLD)
+       if (nm_i->nat_cnt <= NM_WOUT_THRESHOLD)
                return 0;
 
        write_lock(&nm_i->nat_tree_lock);
@@ -320,15 +328,14 @@ static int get_node_path(long block, int offset[4], unsigned int noffset[4])
        noffset[0] = 0;
 
        if (block < direct_index) {
-               offset[n++] = block;
-               level = 0;
+               offset[n] = block;
                goto got;
        }
        block -= direct_index;
        if (block < direct_blks) {
                offset[n++] = NODE_DIR1_BLOCK;
                noffset[n] = 1;
-               offset[n++] = block;
+               offset[n] = block;
                level = 1;
                goto got;
        }
@@ -336,7 +343,7 @@ static int get_node_path(long block, int offset[4], unsigned int noffset[4])
        if (block < direct_blks) {
                offset[n++] = NODE_DIR2_BLOCK;
                noffset[n] = 2;
-               offset[n++] = block;
+               offset[n] = block;
                level = 1;
                goto got;
        }
@@ -346,7 +353,7 @@ static int get_node_path(long block, int offset[4], unsigned int noffset[4])
                noffset[n] = 3;
                offset[n++] = block / direct_blks;
                noffset[n] = 4 + offset[n - 1];
-               offset[n++] = block % direct_blks;
+               offset[n] = block % direct_blks;
                level = 2;
                goto got;
        }
@@ -356,7 +363,7 @@ static int get_node_path(long block, int offset[4], unsigned int noffset[4])
                noffset[n] = 4 + dptrs_per_blk;
                offset[n++] = block / direct_blks;
                noffset[n] = 5 + dptrs_per_blk + offset[n - 1];
-               offset[n++] = block % direct_blks;
+               offset[n] = block % direct_blks;
                level = 2;
                goto got;
        }
@@ -371,7 +378,7 @@ static int get_node_path(long block, int offset[4], unsigned int noffset[4])
                noffset[n] = 7 + (dptrs_per_blk * 2) +
                              offset[n - 2] * (dptrs_per_blk + 1) +
                              offset[n - 1];
-               offset[n++] = block % direct_blks;
+               offset[n] = block % direct_blks;
                level = 3;
                goto got;
        } else {
@@ -383,8 +390,11 @@ got:
 
 /*
  * Caller should call f2fs_put_dnode(dn).
+ * Also, it should grab and release a mutex by calling mutex_lock_op() and
+ * mutex_unlock_op() only if mode is set to ALLOC_NODE.
+ * In the LOOKUP_NODE(_RA) cases, we don't need to care about the mutex.
  */
-int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int ro)
+int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode)
 {
        struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
        struct page *npage[4];
@@ -403,7 +413,8 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int ro)
                return PTR_ERR(npage[0]);
 
        parent = npage[0];
-       nids[1] = get_nid(parent, offset[0], true);
+       if (level != 0)
+               nids[1] = get_nid(parent, offset[0], true);
        dn->inode_page = npage[0];
        dn->inode_page_locked = true;
 
@@ -411,12 +422,9 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int ro)
        for (i = 1; i <= level; i++) {
                bool done = false;
 
-               if (!nids[i] && !ro) {
-                       mutex_lock_op(sbi, NODE_NEW);
-
+               if (!nids[i] && mode == ALLOC_NODE) {
                        /* alloc new node */
                        if (!alloc_nid(sbi, &(nids[i]))) {
-                               mutex_unlock_op(sbi, NODE_NEW);
                                err = -ENOSPC;
                                goto release_pages;
                        }
@@ -425,16 +433,14 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int ro)
                        npage[i] = new_node_page(dn, noffset[i]);
                        if (IS_ERR(npage[i])) {
                                alloc_nid_failed(sbi, nids[i]);
-                               mutex_unlock_op(sbi, NODE_NEW);
                                err = PTR_ERR(npage[i]);
                                goto release_pages;
                        }
 
                        set_nid(parent, offset[i - 1], nids[i], i == 1);
                        alloc_nid_done(sbi, nids[i]);
-                       mutex_unlock_op(sbi, NODE_NEW);
                        done = true;
-               } else if (ro && i == level && level > 1) {
+               } else if (mode == LOOKUP_NODE_RA && i == level && level > 1) {
                        npage[i] = get_node_page_ra(parent, offset[i - 1]);
                        if (IS_ERR(npage[i])) {
                                err = PTR_ERR(npage[i]);
@@ -507,6 +513,7 @@ invalidate:
 
        f2fs_put_page(dn->node_page, 1);
        dn->node_page = NULL;
+       trace_f2fs_truncate_node(dn->inode, dn->nid, ni.blk_addr);
 }
 
 static int truncate_dnode(struct dnode_of_data *dn)
@@ -547,9 +554,13 @@ static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs,
        if (dn->nid == 0)
                return NIDS_PER_BLOCK + 1;
 
+       trace_f2fs_truncate_nodes_enter(dn->inode, dn->nid, dn->data_blkaddr);
+
        page = get_node_page(sbi, dn->nid);
-       if (IS_ERR(page))
+       if (IS_ERR(page)) {
+               trace_f2fs_truncate_nodes_exit(dn->inode, PTR_ERR(page));
                return PTR_ERR(page);
+       }
 
        rn = (struct f2fs_node *)page_address(page);
        if (depth < 3) {
@@ -591,10 +602,12 @@ static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs,
        } else {
                f2fs_put_page(page, 1);
        }
+       trace_f2fs_truncate_nodes_exit(dn->inode, freed);
        return freed;
 
 out_err:
        f2fs_put_page(page, 1);
+       trace_f2fs_truncate_nodes_exit(dn->inode, ret);
        return ret;
 }
 
@@ -649,6 +662,9 @@ static int truncate_partial_nodes(struct dnode_of_data *dn,
 fail:
        for (i = depth - 3; i >= 0; i--)
                f2fs_put_page(pages[i], 1);
+
+       trace_f2fs_truncate_partial_nodes(dn->inode, nid, depth, err);
+
        return err;
 }
 
@@ -658,6 +674,7 @@ fail:
 int truncate_inode_blocks(struct inode *inode, pgoff_t from)
 {
        struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+       struct address_space *node_mapping = sbi->node_inode->i_mapping;
        int err = 0, cont = 1;
        int level, offset[4], noffset[4];
        unsigned int nofs = 0;
@@ -665,11 +682,15 @@ int truncate_inode_blocks(struct inode *inode, pgoff_t from)
        struct dnode_of_data dn;
        struct page *page;
 
-       level = get_node_path(from, offset, noffset);
+       trace_f2fs_truncate_inode_blocks_enter(inode, from);
 
+       level = get_node_path(from, offset, noffset);
+restart:
        page = get_node_page(sbi, inode->i_ino);
-       if (IS_ERR(page))
+       if (IS_ERR(page)) {
+               trace_f2fs_truncate_inode_blocks_exit(inode, PTR_ERR(page));
                return PTR_ERR(page);
+       }
 
        set_new_dnode(&dn, inode, page, NULL, 0);
        unlock_page(page);
@@ -728,6 +749,10 @@ skip_partial:
                if (offset[1] == 0 &&
                                rn->i.i_nid[offset[0] - NODE_DIR1_BLOCK]) {
                        lock_page(page);
+                       if (page->mapping != node_mapping) {
+                               f2fs_put_page(page, 1);
+                               goto restart;
+                       }
                        wait_on_page_writeback(page);
                        rn->i.i_nid[offset[0] - NODE_DIR1_BLOCK] = 0;
                        set_page_dirty(page);
@@ -739,9 +764,14 @@ skip_partial:
        }
 fail:
        f2fs_put_page(page, 0);
+       trace_f2fs_truncate_inode_blocks_exit(inode, err);
        return err > 0 ? 0 : err;
 }
 
+/*
+ * Caller should grab and release a mutex by calling mutex_lock_op() and
+ * mutex_unlock_op().
+ */
 int remove_inode_page(struct inode *inode)
 {
        struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
@@ -749,21 +779,16 @@ int remove_inode_page(struct inode *inode)
        nid_t ino = inode->i_ino;
        struct dnode_of_data dn;
 
-       mutex_lock_op(sbi, NODE_TRUNC);
        page = get_node_page(sbi, ino);
-       if (IS_ERR(page)) {
-               mutex_unlock_op(sbi, NODE_TRUNC);
+       if (IS_ERR(page))
                return PTR_ERR(page);
-       }
 
        if (F2FS_I(inode)->i_xattr_nid) {
                nid_t nid = F2FS_I(inode)->i_xattr_nid;
                struct page *npage = get_node_page(sbi, nid);
 
-               if (IS_ERR(npage)) {
-                       mutex_unlock_op(sbi, NODE_TRUNC);
+               if (IS_ERR(npage))
                        return PTR_ERR(npage);
-               }
 
                F2FS_I(inode)->i_xattr_nid = 0;
                set_new_dnode(&dn, inode, page, npage, nid);
@@ -775,23 +800,18 @@ int remove_inode_page(struct inode *inode)
        BUG_ON(inode->i_blocks != 0 && inode->i_blocks != 1);
        set_new_dnode(&dn, inode, page, page, ino);
        truncate_node(&dn);
-
-       mutex_unlock_op(sbi, NODE_TRUNC);
        return 0;
 }
 
 int new_inode_page(struct inode *inode, const struct qstr *name)
 {
-       struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
        struct page *page;
        struct dnode_of_data dn;
 
        /* allocate inode page for new inode */
        set_new_dnode(&dn, inode, NULL, NULL, inode->i_ino);
-       mutex_lock_op(sbi, NODE_NEW);
        page = new_node_page(&dn, 0);
        init_dent_inode(name, page);
-       mutex_unlock_op(sbi, NODE_NEW);
        if (IS_ERR(page))
                return PTR_ERR(page);
        f2fs_put_page(page, 1);
@@ -844,6 +864,12 @@ fail:
        return ERR_PTR(err);
 }
 
+/*
+ * Caller should take the following action for each return value:
+ * 0: f2fs_put_page(page, 0)
+ * LOCKED_PAGE: f2fs_put_page(page, 1)
+ * error: nothing
+ */
 static int read_node_page(struct page *page, int type)
 {
        struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb);
@@ -851,8 +877,14 @@ static int read_node_page(struct page *page, int type)
 
        get_node_info(sbi, page->index, &ni);
 
-       if (ni.blk_addr == NULL_ADDR)
+       if (ni.blk_addr == NULL_ADDR) {
+               f2fs_put_page(page, 1);
                return -ENOENT;
+       }
+
+       if (PageUptodate(page))
+               return LOCKED_PAGE;
+
        return f2fs_readpage(sbi, page, ni.blk_addr, type);
 }
 
@@ -863,40 +895,53 @@ void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid)
 {
        struct address_space *mapping = sbi->node_inode->i_mapping;
        struct page *apage;
+       int err;
 
        apage = find_get_page(mapping, nid);
-       if (apage && PageUptodate(apage))
-               goto release_out;
+       if (apage && PageUptodate(apage)) {
+               f2fs_put_page(apage, 0);
+               return;
+       }
        f2fs_put_page(apage, 0);
 
        apage = grab_cache_page(mapping, nid);
        if (!apage)
                return;
 
-       if (read_node_page(apage, READA))
-               unlock_page(apage);
-
-release_out:
-       f2fs_put_page(apage, 0);
+       err = read_node_page(apage, READA);
+       if (err == 0)
+               f2fs_put_page(apage, 0);
+       else if (err == LOCKED_PAGE)
+               f2fs_put_page(apage, 1);
        return;
 }
 
 struct page *get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid)
 {
-       int err;
-       struct page *page;
        struct address_space *mapping = sbi->node_inode->i_mapping;
-
+       struct page *page;
+       int err;
+repeat:
        page = grab_cache_page(mapping, nid);
        if (!page)
                return ERR_PTR(-ENOMEM);
 
        err = read_node_page(page, READ_SYNC);
-       if (err) {
-               f2fs_put_page(page, 1);
+       if (err < 0)
                return ERR_PTR(err);
-       }
+       else if (err == LOCKED_PAGE)
+               goto got_it;
 
+       lock_page(page);
+       if (!PageUptodate(page)) {
+               f2fs_put_page(page, 1);
+               return ERR_PTR(-EIO);
+       }
+       if (page->mapping != mapping) {
+               f2fs_put_page(page, 1);
+               goto repeat;
+       }
+got_it:
        BUG_ON(nid != nid_of_node(page));
        mark_page_accessed(page);
        return page;
@@ -910,31 +955,27 @@ struct page *get_node_page_ra(struct page *parent, int start)
 {
        struct f2fs_sb_info *sbi = F2FS_SB(parent->mapping->host->i_sb);
        struct address_space *mapping = sbi->node_inode->i_mapping;
-       int i, end;
-       int err = 0;
-       nid_t nid;
+       struct blk_plug plug;
        struct page *page;
+       int err, i, end;
+       nid_t nid;
 
        /* First, try getting the desired direct node. */
        nid = get_nid(parent, start, false);
        if (!nid)
                return ERR_PTR(-ENOENT);
-
-       page = find_get_page(mapping, nid);
-       if (page && PageUptodate(page))
-               goto page_hit;
-       f2fs_put_page(page, 0);
-
 repeat:
        page = grab_cache_page(mapping, nid);
        if (!page)
                return ERR_PTR(-ENOMEM);
 
-       err = read_node_page(page, READA);
-       if (err) {
-               f2fs_put_page(page, 1);
+       err = read_node_page(page, READ_SYNC);
+       if (err < 0)
                return ERR_PTR(err);
-       }
+       else if (err == LOCKED_PAGE)
+               goto page_hit;
+
+       blk_start_plug(&plug);
 
        /* Then, try readahead for siblings of the desired node */
        end = start + MAX_RA_NODE;
@@ -946,18 +987,19 @@ repeat:
                ra_node_page(sbi, nid);
        }
 
-page_hit:
-       lock_page(page);
-       if (PageError(page)) {
-               f2fs_put_page(page, 1);
-               return ERR_PTR(-EIO);
-       }
+       blk_finish_plug(&plug);
 
-       /* Has the page been truncated? */
+       lock_page(page);
        if (page->mapping != mapping) {
                f2fs_put_page(page, 1);
                goto repeat;
        }
+page_hit:
+       if (!PageUptodate(page)) {
+               f2fs_put_page(page, 1);
+               return ERR_PTR(-EIO);
+       }
+       mark_page_accessed(page);
        return page;
 }
 
@@ -972,7 +1014,7 @@ void sync_inode_page(struct dnode_of_data *dn)
                if (!dn->inode_page_locked)
                        unlock_page(dn->inode_page);
        } else {
-               f2fs_write_inode(dn->inode, NULL);
+               update_inode_page(dn->inode);
        }
 }
 
@@ -1087,17 +1129,8 @@ static int f2fs_write_node_page(struct page *page,
        block_t new_addr;
        struct node_info ni;
 
-       if (wbc->for_reclaim) {
-               dec_page_count(sbi, F2FS_DIRTY_NODES);
-               wbc->pages_skipped++;
-               set_page_dirty(page);
-               return AOP_WRITEPAGE_ACTIVATE;
-       }
-
        wait_on_page_writeback(page);
 
-       mutex_lock_op(sbi, NODE_WRITE);
-
        /* get old block addr of this node page */
        nid = nid_of_node(page);
        BUG_ON(page->index != nid);
@@ -1105,17 +1138,25 @@ static int f2fs_write_node_page(struct page *page,
        get_node_info(sbi, nid, &ni);
 
        /* This page is already truncated */
-       if (ni.blk_addr == NULL_ADDR)
+       if (ni.blk_addr == NULL_ADDR) {
+               dec_page_count(sbi, F2FS_DIRTY_NODES);
+               unlock_page(page);
                return 0;
+       }
 
-       set_page_writeback(page);
+       if (wbc->for_reclaim) {
+               dec_page_count(sbi, F2FS_DIRTY_NODES);
+               wbc->pages_skipped++;
+               set_page_dirty(page);
+               return AOP_WRITEPAGE_ACTIVATE;
+       }
 
-       /* insert node offset */
+       mutex_lock(&sbi->node_write);
+       set_page_writeback(page);
        write_node_page(sbi, page, nid, ni.blk_addr, &new_addr);
        set_node_addr(sbi, &ni, new_addr);
        dec_page_count(sbi, F2FS_DIRTY_NODES);
-
-       mutex_unlock_op(sbi, NODE_WRITE);
+       mutex_unlock(&sbi->node_write);
        unlock_page(page);
        return 0;
 }
@@ -1130,12 +1171,11 @@ static int f2fs_write_node_pages(struct address_space *mapping,
                            struct writeback_control *wbc)
 {
        struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb);
-       struct block_device *bdev = sbi->sb->s_bdev;
        long nr_to_write = wbc->nr_to_write;
 
        /* First check balancing cached NAT entries */
        if (try_to_free_nats(sbi, NAT_ENTRY_PER_BLOCK)) {
-               write_checkpoint(sbi, false);
+               f2fs_sync_fs(sbi->sb, true);
                return 0;
        }
 
@@ -1144,10 +1184,9 @@ static int f2fs_write_node_pages(struct address_space *mapping,
                return 0;
 
        /* if mounting is failed, skip writing node pages */
-       wbc->nr_to_write = bio_get_nr_vecs(bdev);
+       wbc->nr_to_write = max_hw_blocks(sbi);
        sync_node_pages(sbi, 0, wbc);
-       wbc->nr_to_write = nr_to_write -
-               (bio_get_nr_vecs(bdev) - wbc->nr_to_write);
+       wbc->nr_to_write = nr_to_write - (max_hw_blocks(sbi) - wbc->nr_to_write);
        return 0;
 }
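
The nr_to_write bookkeeping above is easier to see with numbers: if the caller came in with nr_to_write = 1000 and max_hw_blocks(sbi) is 256, the function temporarily hands 256 to sync_node_pages(); if that call returns with wbc->nr_to_write = 100, then 256 - 100 = 156 node pages were written, and the restored budget is 1000 - 156 = 844, so the node-page writes are charged against the caller's original quota.
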
 
@@ -1178,7 +1217,7 @@ static void f2fs_invalidate_node_page(struct page *page, unsigned long offset)
 static int f2fs_release_node_page(struct page *page, gfp_t wait)
 {
        ClearPagePrivate(page);
-       return 0;
+       return 1;
 }
 
 /*
@@ -1195,14 +1234,13 @@ const struct address_space_operations f2fs_node_aops = {
 static struct free_nid *__lookup_free_nid_list(nid_t n, struct list_head *head)
 {
        struct list_head *this;
-       struct free_nid *i = NULL;
+       struct free_nid *i;
        list_for_each(this, head) {
                i = list_entry(this, struct free_nid, list);
                if (i->nid == n)
-                       break;
-               i = NULL;
+                       return i;
        }
-       return i;
+       return NULL;
 }
 
 static void __del_from_free_nid_list(struct free_nid *i)
@@ -1211,11 +1249,29 @@ static void __del_from_free_nid_list(struct free_nid *i)
        kmem_cache_free(free_nid_slab, i);
 }
 
-static int add_free_nid(struct f2fs_nm_info *nm_i, nid_t nid)
+static int add_free_nid(struct f2fs_nm_info *nm_i, nid_t nid, bool build)
 {
        struct free_nid *i;
+       struct nat_entry *ne;
+       bool allocated = false;
 
        if (nm_i->fcnt > 2 * MAX_FREE_NIDS)
+               return -1;
+
+       /* 0 nid should not be used */
+       if (nid == 0)
+               return 0;
+
+       if (!build)
+               goto retry;
+
+       /* do not add allocated nids */
+       read_lock(&nm_i->nat_tree_lock);
+       ne = __lookup_nat_cache(nm_i, nid);
+       if (ne && nat_get_blkaddr(ne) != NULL_ADDR)
+               allocated = true;
+       read_unlock(&nm_i->nat_tree_lock);
+       if (allocated)
                return 0;
 retry:
        i = kmem_cache_alloc(free_nid_slab, GFP_NOFS);
@@ -1250,63 +1306,59 @@ static void remove_free_nid(struct f2fs_nm_info *nm_i, nid_t nid)
        spin_unlock(&nm_i->free_nid_list_lock);
 }
 
-static int scan_nat_page(struct f2fs_nm_info *nm_i,
+static void scan_nat_page(struct f2fs_nm_info *nm_i,
                        struct page *nat_page, nid_t start_nid)
 {
        struct f2fs_nat_block *nat_blk = page_address(nat_page);
        block_t blk_addr;
-       int fcnt = 0;
        int i;
 
-       /* 0 nid should not be used */
-       if (start_nid == 0)
-               ++start_nid;
-
        i = start_nid % NAT_ENTRY_PER_BLOCK;
 
        for (; i < NAT_ENTRY_PER_BLOCK; i++, start_nid++) {
-               blk_addr  = le32_to_cpu(nat_blk->entries[i].block_addr);
+
+               if (start_nid >= nm_i->max_nid)
+                       break;
+
+               blk_addr = le32_to_cpu(nat_blk->entries[i].block_addr);
                BUG_ON(blk_addr == NEW_ADDR);
-               if (blk_addr == NULL_ADDR)
-                       fcnt += add_free_nid(nm_i, start_nid);
+               if (blk_addr == NULL_ADDR) {
+                       if (add_free_nid(nm_i, start_nid, true) < 0)
+                               break;
+               }
        }
-       return fcnt;
 }
 
 static void build_free_nids(struct f2fs_sb_info *sbi)
 {
-       struct free_nid *fnid, *next_fnid;
        struct f2fs_nm_info *nm_i = NM_I(sbi);
        struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
        struct f2fs_summary_block *sum = curseg->sum_blk;
-       nid_t nid = 0;
-       bool is_cycled = false;
-       int fcnt = 0;
-       int i;
+       int i = 0;
+       nid_t nid = nm_i->next_scan_nid;
 
-       nid = nm_i->next_scan_nid;
-       nm_i->init_scan_nid = nid;
+       /* Enough entries */
+       if (nm_i->fcnt > NAT_ENTRY_PER_BLOCK)
+               return;
 
+       /* readahead nat pages to be scanned */
        ra_nat_pages(sbi, nid);
 
        while (1) {
                struct page *page = get_current_nat_page(sbi, nid);
 
-               fcnt += scan_nat_page(nm_i, page, nid);
+               scan_nat_page(nm_i, page, nid);
                f2fs_put_page(page, 1);
 
                nid += (NAT_ENTRY_PER_BLOCK - (nid % NAT_ENTRY_PER_BLOCK));
-
-               if (nid >= nm_i->max_nid) {
+               if (nid >= nm_i->max_nid)
                        nid = 0;
-                       is_cycled = true;
-               }
-               if (fcnt > MAX_FREE_NIDS)
-                       break;
-               if (is_cycled && nm_i->init_scan_nid <= nid)
+
+               if (i++ == FREE_NID_PAGES)
                        break;
        }
 
+       /* go to the next free nat pages to find free nids abundantly */
        nm_i->next_scan_nid = nid;
 
        /* find free nids from current sum_pages */
@@ -1315,22 +1367,11 @@ static void build_free_nids(struct f2fs_sb_info *sbi)
                block_t addr = le32_to_cpu(nat_in_journal(sum, i).block_addr);
                nid = le32_to_cpu(nid_in_journal(sum, i));
                if (addr == NULL_ADDR)
-                       add_free_nid(nm_i, nid);
+                       add_free_nid(nm_i, nid, true);
                else
                        remove_free_nid(nm_i, nid);
        }
        mutex_unlock(&curseg->curseg_mutex);
-
-       /* remove the free nids from current allocated nids */
-       list_for_each_entry_safe(fnid, next_fnid, &nm_i->free_nid_list, list) {
-               struct nat_entry *ne;
-
-               read_lock(&nm_i->nat_tree_lock);
-               ne = __lookup_nat_cache(nm_i, fnid->nid);
-               if (ne && nat_get_blkaddr(ne) != NULL_ADDR)
-                       remove_free_nid(nm_i, fnid->nid);
-               read_unlock(&nm_i->nat_tree_lock);
-       }
 }
 
 /*
@@ -1344,41 +1385,36 @@ bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid)
        struct free_nid *i = NULL;
        struct list_head *this;
 retry:
-       mutex_lock(&nm_i->build_lock);
-       if (!nm_i->fcnt) {
-               /* scan NAT in order to build free nid list */
-               build_free_nids(sbi);
-               if (!nm_i->fcnt) {
-                       mutex_unlock(&nm_i->build_lock);
-                       return false;
-               }
-       }
-       mutex_unlock(&nm_i->build_lock);
+       if (sbi->total_valid_node_count + 1 >= nm_i->max_nid)
+               return false;
 
-       /*
-        * We check fcnt again since previous check is racy as
-        * we didn't hold free_nid_list_lock. So other thread
-        * could consume all of free nids.
-        */
        spin_lock(&nm_i->free_nid_list_lock);
-       if (!nm_i->fcnt) {
-               spin_unlock(&nm_i->free_nid_list_lock);
-               goto retry;
-       }
 
-       BUG_ON(list_empty(&nm_i->free_nid_list));
-       list_for_each(this, &nm_i->free_nid_list) {
-               i = list_entry(this, struct free_nid, list);
-               if (i->state == NID_NEW)
-                       break;
-       }
+       /* We should not use stale free nids created by build_free_nids */
+       if (nm_i->fcnt && !sbi->on_build_free_nids) {
+               BUG_ON(list_empty(&nm_i->free_nid_list));
+               list_for_each(this, &nm_i->free_nid_list) {
+                       i = list_entry(this, struct free_nid, list);
+                       if (i->state == NID_NEW)
+                               break;
+               }
 
-       BUG_ON(i->state != NID_NEW);
-       *nid = i->nid;
-       i->state = NID_ALLOC;
-       nm_i->fcnt--;
+               BUG_ON(i->state != NID_NEW);
+               *nid = i->nid;
+               i->state = NID_ALLOC;
+               nm_i->fcnt--;
+               spin_unlock(&nm_i->free_nid_list_lock);
+               return true;
+       }
        spin_unlock(&nm_i->free_nid_list_lock);
-       return true;
+
+       /* Let's scan nat pages and its caches to get free nids */
+       mutex_lock(&nm_i->build_lock);
+       sbi->on_build_free_nids = 1;
+       build_free_nids(sbi);
+       sbi->on_build_free_nids = 0;
+       mutex_unlock(&nm_i->build_lock);
+       goto retry;
 }
 
 /*
@@ -1391,10 +1427,8 @@ void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid)
 
        spin_lock(&nm_i->free_nid_list_lock);
        i = __lookup_free_nid_list(nid, &nm_i->free_nid_list);
-       if (i) {
-               BUG_ON(i->state != NID_ALLOC);
-               __del_from_free_nid_list(i);
-       }
+       BUG_ON(!i || i->state != NID_ALLOC);
+       __del_from_free_nid_list(i);
        spin_unlock(&nm_i->free_nid_list_lock);
 }
 
@@ -1403,8 +1437,19 @@ void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid)
  */
 void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid)
 {
-       alloc_nid_done(sbi, nid);
-       add_free_nid(NM_I(sbi), nid);
+       struct f2fs_nm_info *nm_i = NM_I(sbi);
+       struct free_nid *i;
+
+       spin_lock(&nm_i->free_nid_list_lock);
+       i = __lookup_free_nid_list(nid, &nm_i->free_nid_list);
+       BUG_ON(!i || i->state != NID_ALLOC);
+       if (nm_i->fcnt > 2 * MAX_FREE_NIDS) {
+               __del_from_free_nid_list(i);
+       } else {
+               i->state = NID_NEW;
+               nm_i->fcnt++;
+       }
+       spin_unlock(&nm_i->free_nid_list_lock);
 }
 
 void recover_node_page(struct f2fs_sb_info *sbi, struct page *page,
@@ -1475,23 +1520,24 @@ int restore_node_summary(struct f2fs_sb_info *sbi,
        sum_entry = &sum->entries[0];
 
        for (i = 0; i < last_offset; i++, sum_entry++) {
+               /*
+                * In order to read next node page,
+                * we must clear PageUptodate flag.
+                */
+               ClearPageUptodate(page);
+
                if (f2fs_readpage(sbi, page, addr, READ_SYNC))
                        goto out;
 
+               lock_page(page);
                rn = (struct f2fs_node *)page_address(page);
                sum_entry->nid = rn->footer.nid;
                sum_entry->version = 0;
                sum_entry->ofs_in_node = 0;
                addr++;
-
-               /*
-                * In order to read next node page,
-                * we must clear PageUptodate flag.
-                */
-               ClearPageUptodate(page);
        }
-out:
        unlock_page(page);
+out:
        __free_pages(page, 0);
        return 0;
 }
@@ -1614,13 +1660,11 @@ flush_now:
                        nid_in_journal(sum, offset) = cpu_to_le32(nid);
                }
 
-               if (nat_get_blkaddr(ne) == NULL_ADDR) {
+               if (nat_get_blkaddr(ne) == NULL_ADDR &&
+                               add_free_nid(NM_I(sbi), nid, false) <= 0) {
                        write_lock(&nm_i->nat_tree_lock);
                        __del_from_nat_cache(nm_i, ne);
                        write_unlock(&nm_i->nat_tree_lock);
-
-                       /* We can reuse this freed nid at this point */
-                       add_free_nid(NM_I(sbi), nid);
                } else {
                        write_lock(&nm_i->nat_tree_lock);
                        __clear_nat_cache_dirty(nm_i, ne);
@@ -1661,19 +1705,16 @@ static int init_node_manager(struct f2fs_sb_info *sbi)
        spin_lock_init(&nm_i->free_nid_list_lock);
        rwlock_init(&nm_i->nat_tree_lock);
 
-       nm_i->bitmap_size = __bitmap_size(sbi, NAT_BITMAP);
-       nm_i->init_scan_nid = le32_to_cpu(sbi->ckpt->next_free_nid);
        nm_i->next_scan_nid = le32_to_cpu(sbi->ckpt->next_free_nid);
-
-       nm_i->nat_bitmap = kzalloc(nm_i->bitmap_size, GFP_KERNEL);
-       if (!nm_i->nat_bitmap)
-               return -ENOMEM;
+       nm_i->bitmap_size = __bitmap_size(sbi, NAT_BITMAP);
        version_bitmap = __bitmap_ptr(sbi, NAT_BITMAP);
        if (!version_bitmap)
                return -EFAULT;
 
-       /* copy version bitmap */
-       memcpy(nm_i->nat_bitmap, version_bitmap, nm_i->bitmap_size);
+       nm_i->nat_bitmap = kmemdup(version_bitmap, nm_i->bitmap_size,
+                                       GFP_KERNEL);
+       if (!nm_i->nat_bitmap)
+               return -ENOMEM;
        return 0;
 }
 
index afdb130f782e4fd962b92f1d73fffe6620e7d02b..0a2d72f0024ddf88c4df50c46d0eda59a00bbb62 100644 (file)
@@ -29,6 +29,9 @@
 /* vector size for gang look-up from nat cache that consists of radix tree */
 #define NATVEC_SIZE    64
 
+/* return value for read_node_page */
+#define LOCKED_PAGE    1
+
 /*
  * For node information
  */
@@ -239,7 +242,7 @@ static inline bool IS_DNODE(struct page *node_page)
                return false;
        if (ofs >= 6 + 2 * NIDS_PER_BLOCK) {
                ofs -= 6 + 2 * NIDS_PER_BLOCK;
-               if ((long int)ofs % (NIDS_PER_BLOCK + 1))
+               if (!((long int)ofs % (NIDS_PER_BLOCK + 1)))
                        return false;
        }
        return true;
@@ -277,6 +280,21 @@ static inline int is_cold_file(struct inode *inode)
        return F2FS_I(inode)->i_advise & FADVISE_COLD_BIT;
 }
 
+static inline void set_cold_file(struct inode *inode)
+{
+       F2FS_I(inode)->i_advise |= FADVISE_COLD_BIT;
+}
+
+static inline int is_cp_file(struct inode *inode)
+{
+       return F2FS_I(inode)->i_advise & FADVISE_CP_BIT;
+}
+
+static inline void set_cp_file(struct inode *inode)
+{
+       F2FS_I(inode)->i_advise |= FADVISE_CP_BIT;
+}
+
 static inline int is_cold_data(struct page *page)
 {
        return PageChecked(page);
index b235215ac13816feea8a960e05f40bb03b9dadf9..60c8a5097058f02e02c28979dad759a7d734438a 100644 (file)
@@ -53,7 +53,7 @@ static int recover_dentry(struct page *ipage, struct inode *inode)
 
        dir = f2fs_iget(inode->i_sb, le32_to_cpu(raw_inode->i_pino));
        if (IS_ERR(dir)) {
-               err = -EINVAL;
+               err = PTR_ERR(dir);
                goto out;
        }
 
@@ -112,11 +112,14 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
        while (1) {
                struct fsync_inode_entry *entry;
 
-               if (f2fs_readpage(sbi, page, blkaddr, READ_SYNC))
+               err = f2fs_readpage(sbi, page, blkaddr, READ_SYNC);
+               if (err)
                        goto out;
 
+               lock_page(page);
+
                if (cp_ver != cpver_of_node(page))
-                       goto out;
+                       goto unlock_out;
 
                if (!is_fsync_dnode(page))
                        goto next;
@@ -129,24 +132,23 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
                                                        FI_INC_LINK);
                } else {
                        if (IS_INODE(page) && is_dent_dnode(page)) {
-                               if (recover_inode_page(sbi, page)) {
-                                       err = -ENOMEM;
-                                       goto out;
-                               }
+                               err = recover_inode_page(sbi, page);
+                               if (err)
+                                       goto unlock_out;
                        }
 
                        /* add this fsync inode to the list */
                        entry = kmem_cache_alloc(fsync_entry_slab, GFP_NOFS);
                        if (!entry) {
                                err = -ENOMEM;
-                               goto out;
+                               goto unlock_out;
                        }
 
                        entry->inode = f2fs_iget(sbi->sb, ino_of_node(page));
                        if (IS_ERR(entry->inode)) {
                                err = PTR_ERR(entry->inode);
                                kmem_cache_free(fsync_entry_slab, entry);
-                               goto out;
+                               goto unlock_out;
                        }
 
                        list_add_tail(&entry->list, head);
@@ -154,16 +156,20 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
                }
                if (IS_INODE(page)) {
                        err = recover_inode(entry->inode, page);
-                       if (err)
-                               goto out;
+                       if (err == -ENOENT) {
+                               goto next;
+                       } else if (err) {
+                               err = -EINVAL;
+                               goto unlock_out;
+                       }
                }
 next:
                /* check next segment */
                blkaddr = next_blkaddr_of_node(page);
-               ClearPageUptodate(page);
        }
-out:
+unlock_out:
        unlock_page(page);
+out:
        __free_pages(page, 0);
        return err;
 }
@@ -232,13 +238,15 @@ static void check_index_in_prev_nodes(struct f2fs_sb_info *sbi,
        iput(inode);
 }
 
-static void do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
+static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
                                        struct page *page, block_t blkaddr)
 {
        unsigned int start, end;
        struct dnode_of_data dn;
        struct f2fs_summary sum;
        struct node_info ni;
+       int err = 0;
+       int ilock;
 
        start = start_bidx_of_node(ofs_of_node(page));
        if (IS_INODE(page))
@@ -246,9 +254,14 @@ static void do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
        else
                end = start + ADDRS_PER_BLOCK;
 
+       ilock = mutex_lock_op(sbi);
        set_new_dnode(&dn, inode, NULL, NULL, 0);
-       if (get_dnode_of_data(&dn, start, 0))
-               return;
+
+       err = get_dnode_of_data(&dn, start, ALLOC_NODE);
+       if (err) {
+               mutex_unlock_op(sbi, ilock);
+               return err;
+       }
 
        wait_on_page_writeback(dn.node_page);
 
@@ -293,14 +306,17 @@ static void do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
 
        recover_node_page(sbi, dn.node_page, &sum, &ni, blkaddr);
        f2fs_put_dnode(&dn);
+       mutex_unlock_op(sbi, ilock);
+       return 0;
 }
 
-static void recover_data(struct f2fs_sb_info *sbi,
+static int recover_data(struct f2fs_sb_info *sbi,
                                struct list_head *head, int type)
 {
        unsigned long long cp_ver = le64_to_cpu(sbi->ckpt->checkpoint_ver);
        struct curseg_info *curseg;
        struct page *page;
+       int err = 0;
        block_t blkaddr;
 
        /* get node pages in the current segment */
@@ -310,23 +326,29 @@ static void recover_data(struct f2fs_sb_info *sbi,
        /* read node page */
        page = alloc_page(GFP_NOFS | __GFP_ZERO);
        if (IS_ERR(page))
-               return;
+               return -ENOMEM;
+
        lock_page(page);
 
        while (1) {
                struct fsync_inode_entry *entry;
 
-               if (f2fs_readpage(sbi, page, blkaddr, READ_SYNC))
+               err = f2fs_readpage(sbi, page, blkaddr, READ_SYNC);
+               if (err)
                        goto out;
 
+               lock_page(page);
+
                if (cp_ver != cpver_of_node(page))
-                       goto out;
+                       goto unlock_out;
 
                entry = get_fsync_inode(head, ino_of_node(page));
                if (!entry)
                        goto next;
 
-               do_recover_data(sbi, entry->inode, page, blkaddr);
+               err = do_recover_data(sbi, entry->inode, page, blkaddr);
+               if (err)
+                       goto out;
 
                if (entry->blkaddr == blkaddr) {
                        iput(entry->inode);
@@ -336,28 +358,32 @@ static void recover_data(struct f2fs_sb_info *sbi,
 next:
                /* check next segment */
                blkaddr = next_blkaddr_of_node(page);
-               ClearPageUptodate(page);
        }
-out:
+unlock_out:
        unlock_page(page);
+out:
        __free_pages(page, 0);
 
-       allocate_new_segments(sbi);
+       if (!err)
+               allocate_new_segments(sbi);
+       return err;
 }
 
-void recover_fsync_data(struct f2fs_sb_info *sbi)
+int recover_fsync_data(struct f2fs_sb_info *sbi)
 {
        struct list_head inode_list;
+       int err;
 
        fsync_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_inode_entry",
                        sizeof(struct fsync_inode_entry), NULL);
        if (unlikely(!fsync_entry_slab))
-               return;
+               return -ENOMEM;
 
        INIT_LIST_HEAD(&inode_list);
 
        /* step #1: find fsynced inode numbers */
-       if (find_fsync_dnodes(sbi, &inode_list))
+       err = find_fsync_dnodes(sbi, &inode_list);
+       if (err)
                goto out;
 
        if (list_empty(&inode_list))
@@ -365,11 +391,12 @@ void recover_fsync_data(struct f2fs_sb_info *sbi)
 
        /* step #2: recover data */
        sbi->por_doing = 1;
-       recover_data(sbi, &inode_list, CURSEG_WARM_NODE);
+       err = recover_data(sbi, &inode_list, CURSEG_WARM_NODE);
        sbi->por_doing = 0;
        BUG_ON(!list_empty(&inode_list));
 out:
        destroy_fsync_dnodes(sbi, &inode_list);
        kmem_cache_destroy(fsync_entry_slab);
        write_checkpoint(sbi, false);
+       return err;
 }
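
recover_fsync_data() and its helpers above now return an int instead of silently swallowing failures. How the mount path reacts to that value is outside this diff, so the following is only a hypothetical caller sketch; the mount-option check and error label are illustrative, not taken from this patch:

        /* hypothetical mount-time call site */
        if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) {
                err = recover_fsync_data(sbi);
                if (err)
                        goto free_root_inode;   /* abort the mount rather than
                                                 * expose a half-recovered tree */
        }
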
index 777f17e496e606de7f7e02103e606e291521b1f2..d8e84e49a5c301cb139acfa6d0e06cbd1544b82a 100644 (file)
@@ -18,6 +18,7 @@
 #include "f2fs.h"
 #include "segment.h"
 #include "node.h"
+#include <trace/events/f2fs.h>
 
 /*
  * This function balances dirty node and dentry pages.
@@ -49,9 +50,20 @@ static void __locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno,
 
        if (dirty_type == DIRTY) {
                struct seg_entry *sentry = get_seg_entry(sbi, segno);
+               enum dirty_type t = DIRTY_HOT_DATA;
+
                dirty_type = sentry->type;
+
                if (!test_and_set_bit(segno, dirty_i->dirty_segmap[dirty_type]))
                        dirty_i->nr_dirty[dirty_type]++;
+
+               /* Only one bitmap should be set */
+               for (; t <= DIRTY_COLD_NODE; t++) {
+                       if (t == dirty_type)
+                               continue;
+                       if (test_and_clear_bit(segno, dirty_i->dirty_segmap[t]))
+                               dirty_i->nr_dirty[t]--;
+               }
        }
 }
 
@@ -64,13 +76,16 @@ static void __remove_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno,
                dirty_i->nr_dirty[dirty_type]--;
 
        if (dirty_type == DIRTY) {
-               struct seg_entry *sentry = get_seg_entry(sbi, segno);
-               dirty_type = sentry->type;
-               if (test_and_clear_bit(segno,
-                                       dirty_i->dirty_segmap[dirty_type]))
-                       dirty_i->nr_dirty[dirty_type]--;
-               clear_bit(segno, dirty_i->victim_segmap[FG_GC]);
-               clear_bit(segno, dirty_i->victim_segmap[BG_GC]);
+               enum dirty_type t = DIRTY_HOT_DATA;
+
+               /* clear all the bitmaps */
+               for (; t <= DIRTY_COLD_NODE; t++)
+                       if (test_and_clear_bit(segno, dirty_i->dirty_segmap[t]))
+                               dirty_i->nr_dirty[t]--;
+
+               if (get_valid_blocks(sbi, segno, sbi->segs_per_sec) == 0)
+                       clear_bit(GET_SECNO(sbi, segno),
+                                               dirty_i->victim_secmap);
        }
 }
 
@@ -296,13 +311,12 @@ static void write_sum_page(struct f2fs_sb_info *sbi,
        f2fs_put_page(page, 1);
 }
 
-static unsigned int check_prefree_segments(struct f2fs_sb_info *sbi,
-                                       int ofs_unit, int type)
+static unsigned int check_prefree_segments(struct f2fs_sb_info *sbi, int type)
 {
        struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
        unsigned long *prefree_segmap = dirty_i->dirty_segmap[PRE];
-       unsigned int segno, next_segno, i;
-       int ofs = 0;
+       unsigned int segno;
+       unsigned int ofs = 0;
 
        /*
         * If there is not enough reserved sections,
@@ -318,28 +332,46 @@ static unsigned int check_prefree_segments(struct f2fs_sb_info *sbi,
        if (IS_NODESEG(type))
                return NULL_SEGNO;
 next:
-       segno = find_next_bit(prefree_segmap, TOTAL_SEGS(sbi), ofs++);
-       ofs = ((segno / ofs_unit) * ofs_unit) + ofs_unit;
+       segno = find_next_bit(prefree_segmap, TOTAL_SEGS(sbi), ofs);
+       ofs += sbi->segs_per_sec;
+
        if (segno < TOTAL_SEGS(sbi)) {
+               int i;
+
                /* skip intermediate segments in a section */
-               if (segno % ofs_unit)
+               if (segno % sbi->segs_per_sec)
                        goto next;
 
-               /* skip if whole section is not prefree */
-               next_segno = find_next_zero_bit(prefree_segmap,
-                                               TOTAL_SEGS(sbi), segno + 1);
-               if (next_segno - segno < ofs_unit)
+               /* skip if the section is currently used */
+               if (sec_usage_check(sbi, GET_SECNO(sbi, segno)))
                        goto next;
 
+               /* skip if whole section is not prefree */
+               for (i = 1; i < sbi->segs_per_sec; i++)
+                       if (!test_bit(segno + i, prefree_segmap))
+                               goto next;
+
                /* skip if whole section was not free at the last checkpoint */
-               for (i = 0; i < ofs_unit; i++)
-                       if (get_seg_entry(sbi, segno)->ckpt_valid_blocks)
+               for (i = 0; i < sbi->segs_per_sec; i++)
+                       if (get_seg_entry(sbi, segno + i)->ckpt_valid_blocks)
                                goto next;
+
                return segno;
        }
        return NULL_SEGNO;
 }
 
+static int is_next_segment_free(struct f2fs_sb_info *sbi, int type)
+{
+       struct curseg_info *curseg = CURSEG_I(sbi, type);
+       unsigned int segno = curseg->segno;
+       struct free_segmap_info *free_i = FREE_I(sbi);
+
+       if (segno + 1 < TOTAL_SEGS(sbi) && (segno + 1) % sbi->segs_per_sec)
+               return !test_bit(segno + 1, free_i->free_segmap);
+       return 0;
+}
+
 /*
  * Find a new segment from the free segments bitmap to right order
  * This function should be returned with success, otherwise BUG
@@ -348,9 +380,8 @@ static void get_new_segment(struct f2fs_sb_info *sbi,
                        unsigned int *newseg, bool new_sec, int dir)
 {
        struct free_segmap_info *free_i = FREE_I(sbi);
-       unsigned int total_secs = sbi->total_sections;
        unsigned int segno, secno, zoneno;
-       unsigned int total_zones = sbi->total_sections / sbi->secs_per_zone;
+       unsigned int total_zones = TOTAL_SECS(sbi) / sbi->secs_per_zone;
        unsigned int hint = *newseg / sbi->segs_per_sec;
        unsigned int old_zoneno = GET_ZONENO_FROM_SEGNO(sbi, *newseg);
        unsigned int left_start = hint;
@@ -363,16 +394,17 @@ static void get_new_segment(struct f2fs_sb_info *sbi,
        if (!new_sec && ((*newseg + 1) % sbi->segs_per_sec)) {
                segno = find_next_zero_bit(free_i->free_segmap,
                                        TOTAL_SEGS(sbi), *newseg + 1);
-               if (segno < TOTAL_SEGS(sbi))
+               if (segno - *newseg < sbi->segs_per_sec -
+                                       (*newseg % sbi->segs_per_sec))
                        goto got_it;
        }
 find_other_zone:
-       secno = find_next_zero_bit(free_i->free_secmap, total_secs, hint);
-       if (secno >= total_secs) {
+       secno = find_next_zero_bit(free_i->free_secmap, TOTAL_SECS(sbi), hint);
+       if (secno >= TOTAL_SECS(sbi)) {
                if (dir == ALLOC_RIGHT) {
                        secno = find_next_zero_bit(free_i->free_secmap,
-                                               total_secs, 0);
-                       BUG_ON(secno >= total_secs);
+                                                       TOTAL_SECS(sbi), 0);
+                       BUG_ON(secno >= TOTAL_SECS(sbi));
                } else {
                        go_left = 1;
                        left_start = hint - 1;
@@ -387,8 +419,8 @@ find_other_zone:
                        continue;
                }
                left_start = find_next_zero_bit(free_i->free_secmap,
-                                               total_secs, 0);
-               BUG_ON(left_start >= total_secs);
+                                                       TOTAL_SECS(sbi), 0);
+               BUG_ON(left_start >= TOTAL_SECS(sbi));
                break;
        }
        secno = left_start;
@@ -561,20 +593,20 @@ static void allocate_segment_by_default(struct f2fs_sb_info *sbi,
                                                int type, bool force)
 {
        struct curseg_info *curseg = CURSEG_I(sbi, type);
-       unsigned int ofs_unit;
 
        if (force) {
                new_curseg(sbi, type, true);
                goto out;
        }
 
-       ofs_unit = need_SSR(sbi) ? 1 : sbi->segs_per_sec;
-       curseg->next_segno = check_prefree_segments(sbi, ofs_unit, type);
+       curseg->next_segno = check_prefree_segments(sbi, type);
 
        if (curseg->next_segno != NULL_SEGNO)
                change_curseg(sbi, type, false);
        else if (type == CURSEG_WARM_NODE)
                new_curseg(sbi, type, false);
+       else if (curseg->alloc_type == LFS && is_next_segment_free(sbi, type))
+               new_curseg(sbi, type, false);
        else if (need_SSR(sbi) && get_ssr_segment(sbi, type))
                change_curseg(sbi, type, true);
        else
@@ -656,10 +688,16 @@ static void do_submit_bio(struct f2fs_sb_info *sbi,
        if (type >= META_FLUSH)
                rw = WRITE_FLUSH_FUA;
 
+       if (btype == META)
+               rw |= REQ_META;
+
        if (sbi->bio[btype]) {
                struct bio_private *p = sbi->bio[btype]->bi_private;
                p->sbi = sbi;
                sbi->bio[btype]->bi_end_io = f2fs_end_io_write;
+
+               trace_f2fs_do_submit_bio(sbi->sb, btype, sync, sbi->bio[btype]);
+
                if (type == META_FLUSH) {
                        DECLARE_COMPLETION_ONSTACK(wait);
                        p->is_sync = true;
@@ -696,7 +734,7 @@ static void submit_write_page(struct f2fs_sb_info *sbi, struct page *page,
                do_submit_bio(sbi, type, false);
 alloc_new:
        if (sbi->bio[type] == NULL) {
-               sbi->bio[type] = f2fs_bio_alloc(bdev, bio_get_nr_vecs(bdev));
+               sbi->bio[type] = f2fs_bio_alloc(bdev, max_hw_blocks(sbi));
                sbi->bio[type]->bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr);
                /*
                 * The end_io will be assigned at the submission phase.
@@ -714,6 +752,7 @@ alloc_new:
        sbi->last_block_in_bio[type] = blk_addr;
 
        up_write(&sbi->bio_sem);
+       trace_f2fs_submit_write_page(page, blk_addr, type);
 }
 
 static bool __has_curseg_space(struct f2fs_sb_info *sbi, int type)
@@ -1390,7 +1429,7 @@ static int build_sit_info(struct f2fs_sb_info *sbi)
        }
 
        if (sbi->segs_per_sec > 1) {
-               sit_i->sec_entries = vzalloc(sbi->total_sections *
+               sit_i->sec_entries = vzalloc(TOTAL_SECS(sbi) *
                                        sizeof(struct sec_entry));
                if (!sit_i->sec_entries)
                        return -ENOMEM;
@@ -1403,10 +1442,9 @@ static int build_sit_info(struct f2fs_sb_info *sbi)
        bitmap_size = __bitmap_size(sbi, SIT_BITMAP);
        src_bitmap = __bitmap_ptr(sbi, SIT_BITMAP);
 
-       dst_bitmap = kzalloc(bitmap_size, GFP_KERNEL);
+       dst_bitmap = kmemdup(src_bitmap, bitmap_size, GFP_KERNEL);
        if (!dst_bitmap)
                return -ENOMEM;
-       memcpy(dst_bitmap, src_bitmap, bitmap_size);
 
        /* init SIT information */
        sit_i->s_ops = &default_salloc_ops;
@@ -1442,7 +1480,7 @@ static int build_free_segmap(struct f2fs_sb_info *sbi)
        if (!free_i->free_segmap)
                return -ENOMEM;
 
-       sec_bitmap_size = f2fs_bitmap_size(sbi->total_sections);
+       sec_bitmap_size = f2fs_bitmap_size(TOTAL_SECS(sbi));
        free_i->free_secmap = kmalloc(sec_bitmap_size, GFP_KERNEL);
        if (!free_i->free_secmap)
                return -ENOMEM;
@@ -1559,14 +1597,13 @@ static void init_dirty_segmap(struct f2fs_sb_info *sbi)
        }
 }
 
-static int init_victim_segmap(struct f2fs_sb_info *sbi)
+static int init_victim_secmap(struct f2fs_sb_info *sbi)
 {
        struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
-       unsigned int bitmap_size = f2fs_bitmap_size(TOTAL_SEGS(sbi));
+       unsigned int bitmap_size = f2fs_bitmap_size(TOTAL_SECS(sbi));
 
-       dirty_i->victim_segmap[FG_GC] = kzalloc(bitmap_size, GFP_KERNEL);
-       dirty_i->victim_segmap[BG_GC] = kzalloc(bitmap_size, GFP_KERNEL);
-       if (!dirty_i->victim_segmap[FG_GC] || !dirty_i->victim_segmap[BG_GC])
+       dirty_i->victim_secmap = kzalloc(bitmap_size, GFP_KERNEL);
+       if (!dirty_i->victim_secmap)
                return -ENOMEM;
        return 0;
 }
@@ -1593,7 +1630,7 @@ static int build_dirty_segmap(struct f2fs_sb_info *sbi)
        }
 
        init_dirty_segmap(sbi);
-       return init_victim_segmap(sbi);
+       return init_victim_secmap(sbi);
 }
 
 /*
@@ -1680,18 +1717,10 @@ static void discard_dirty_segmap(struct f2fs_sb_info *sbi,
        mutex_unlock(&dirty_i->seglist_lock);
 }
 
-void reset_victim_segmap(struct f2fs_sb_info *sbi)
-{
-       unsigned int bitmap_size = f2fs_bitmap_size(TOTAL_SEGS(sbi));
-       memset(DIRTY_I(sbi)->victim_segmap[FG_GC], 0, bitmap_size);
-}
-
-static void destroy_victim_segmap(struct f2fs_sb_info *sbi)
+static void destroy_victim_secmap(struct f2fs_sb_info *sbi)
 {
        struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
-
-       kfree(dirty_i->victim_segmap[FG_GC]);
-       kfree(dirty_i->victim_segmap[BG_GC]);
+       kfree(dirty_i->victim_secmap);
 }
 
 static void destroy_dirty_segmap(struct f2fs_sb_info *sbi)
@@ -1706,7 +1735,7 @@ static void destroy_dirty_segmap(struct f2fs_sb_info *sbi)
        for (i = 0; i < NR_DIRTY_TYPE; i++)
                discard_dirty_segmap(sbi, i);
 
-       destroy_victim_segmap(sbi);
+       destroy_victim_secmap(sbi);
        SM_I(sbi)->dirty_info = NULL;
        kfree(dirty_i);
 }
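
Editorial aside on the f2fs segment allocation changes in this file (the hunk at @@ -363,16 +394,17 above): instead of only checking that find_next_zero_bit() found some free segment, get_new_segment() now verifies that the segment it found still lies inside the section containing *newseg, via segno - *newseg < segs_per_sec - (*newseg % segs_per_sec). A minimal standalone sketch of that arithmetic, using segs_per_sec = 4 purely as an example geometry (not a value from the patch):

/*
 * Illustration only: the "found segment is still inside the current
 * section" test, with segs_per_sec = 4.  Section 1 spans segments 4..7.
 */
#include <stdio.h>
#include <stdbool.h>

static bool in_same_section(unsigned int newseg, unsigned int found,
			    unsigned int segs_per_sec)
{
	/* number of segments from newseg to the end of its section */
	unsigned int left = segs_per_sec - (newseg % segs_per_sec);

	return found - newseg < left;
}

int main(void)
{
	printf("%d\n", in_same_section(5, 6, 4));	/* 1: segment 6 is in section 1 */
	printf("%d\n", in_same_section(5, 9, 4));	/* 0: segment 9 is in section 2 */
	return 0;
}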
index 552dadbb23272e8b2c3e10a04cfa3d0c0262de2d..062424a0e4c3a1ab912e0008c1eaa836f29af85f 100644 (file)
@@ -8,10 +8,13 @@
  * it under the terms of the GNU General Public License version 2 as
  * published by the Free Software Foundation.
  */
+#include <linux/blkdev.h>
+
 /* constant macro */
 #define NULL_SEGNO                     ((unsigned int)(~0))
+#define NULL_SECNO                     ((unsigned int)(~0))
 
-/* V: Logical segment # in volume, R: Relative segment # in main area */
+/* L: Logical segment # in volume, R: Relative segment # in main area */
 #define GET_L2R_SEGNO(free_i, segno)   (segno - free_i->start_segno)
 #define GET_R2L_SEGNO(free_i, segno)   (segno + free_i->start_segno)
 
        ((t == CURSEG_HOT_NODE) || (t == CURSEG_COLD_NODE) ||           \
        (t == CURSEG_WARM_NODE))
 
-#define IS_CURSEG(sbi, segno)                                          \
-       ((segno == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno) ||    \
-        (segno == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno) ||   \
-        (segno == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno) ||   \
-        (segno == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno) ||    \
-        (segno == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno) ||   \
-        (segno == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno))
+#define IS_CURSEG(sbi, seg)                                            \
+       ((seg == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno) ||      \
+        (seg == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno) ||     \
+        (seg == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno) ||     \
+        (seg == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno) ||      \
+        (seg == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno) ||     \
+        (seg == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno))
 
 #define IS_CURSEC(sbi, secno)                                          \
        ((secno == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno /              \
 #define f2fs_bitmap_size(nr)                   \
        (BITS_TO_LONGS(nr) * sizeof(unsigned long))
 #define TOTAL_SEGS(sbi)        (SM_I(sbi)->main_segments)
+#define TOTAL_SECS(sbi)        (sbi->total_sections)
 
 #define SECTOR_FROM_BLOCK(sbi, blk_addr)                               \
        (blk_addr << ((sbi)->log_blocksize - F2FS_LOG_SECTOR_SIZE))
+#define SECTOR_TO_BLOCK(sbi, sectors)                                  \
+       (sectors >> ((sbi)->log_blocksize - F2FS_LOG_SECTOR_SIZE))
 
 /* during checkpoint, bio_private is used to synchronize the last bio */
 struct bio_private {
@@ -213,7 +219,7 @@ struct dirty_seglist_info {
        unsigned long *dirty_segmap[NR_DIRTY_TYPE];
        struct mutex seglist_lock;              /* lock for segment bitmaps */
        int nr_dirty[NR_DIRTY_TYPE];            /* # of dirty segments */
-       unsigned long *victim_segmap[2];        /* BG_GC, FG_GC */
+       unsigned long *victim_secmap;           /* background GC victims */
 };
 
 /* victim selection function for cleaning and SSR */
@@ -464,8 +470,7 @@ static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, int freed)
 
 static inline int utilization(struct f2fs_sb_info *sbi)
 {
-       return (long int)valid_user_blocks(sbi) * 100 /
-                       (long int)sbi->user_block_count;
+       return div_u64(valid_user_blocks(sbi) * 100, sbi->user_block_count);
 }
 
 /*
@@ -616,3 +621,17 @@ static inline block_t sum_blk_addr(struct f2fs_sb_info *sbi, int base, int type)
                le32_to_cpu(F2FS_CKPT(sbi)->cp_pack_total_block_count)
                                - (base + 1) + type;
 }
+
+static inline bool sec_usage_check(struct f2fs_sb_info *sbi, unsigned int secno)
+{
+       if (IS_CURSEC(sbi, secno) || (sbi->cur_victim_sec == secno))
+               return true;
+       return false;
+}
+
+static inline unsigned int max_hw_blocks(struct f2fs_sb_info *sbi)
+{
+       struct block_device *bdev = sbi->sb->s_bdev;
+       struct request_queue *q = bdev_get_queue(bdev);
+       return SECTOR_TO_BLOCK(sbi, queue_max_sectors(q));
+}
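
Editorial aside on the new SECTOR_TO_BLOCK() and max_hw_blocks() helpers above: both are plain shift arithmetic between 512-byte sectors and filesystem blocks, so the largest bio f2fs allocates matches what the request queue can take in one go. A standalone sketch with an assumed 4 KB block size (log_blocksize = 12, F2FS_LOG_SECTOR_SIZE = 9, shift = 3); the numbers are illustrative, not taken from the patch:

#include <stdio.h>

int main(void)
{
	unsigned int log_blocksize = 12;		/* example: 4096-byte blocks */
	unsigned int log_sectorsize = 9;		/* 512-byte sectors */
	unsigned int shift = log_blocksize - log_sectorsize;

	unsigned long long blk_addr = 1000;
	unsigned long long sectors = blk_addr << shift;	/* SECTOR_FROM_BLOCK */
	unsigned long long max_sectors = 256;		/* e.g. queue_max_sectors() */

	printf("block %llu starts at sector %llu\n", blk_addr, sectors);
	printf("a %llu-sector request holds %llu blocks\n",
	       max_sectors, max_sectors >> shift);	/* max_hw_blocks() analogue */
	return 0;
}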
index 62e017743af6285bc7cdebbe692095b0d71c67d2..8555f7df82c796720c2c85ce0bd0af6248efa1c2 100644 (file)
@@ -12,7 +12,6 @@
 #include <linux/init.h>
 #include <linux/fs.h>
 #include <linux/statfs.h>
-#include <linux/proc_fs.h>
 #include <linux/buffer_head.h>
 #include <linux/backing-dev.h>
 #include <linux/kthread.h>
 #include <linux/seq_file.h>
 #include <linux/random.h>
 #include <linux/exportfs.h>
+#include <linux/blkdev.h>
 #include <linux/f2fs_fs.h>
 
 #include "f2fs.h"
 #include "node.h"
+#include "segment.h"
 #include "xattr.h"
 
+#define CREATE_TRACE_POINTS
+#include <trace/events/f2fs.h>
+
 static struct kmem_cache *f2fs_inode_cachep;
 
 enum {
@@ -94,6 +98,20 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb)
        return &fi->vfs_inode;
 }
 
+static int f2fs_drop_inode(struct inode *inode)
+{
+       /*
+        * This is to avoid a deadlock condition like below.
+        * writeback_single_inode(inode)
+        *  - f2fs_write_data_page
+        *    - f2fs_gc -> iput -> evict
+        *       - inode_wait_for_writeback(inode)
+        */
+       if (!inode_unhashed(inode) && inode->i_state & I_SYNC)
+               return 0;
+       return generic_drop_inode(inode);
+}
+
 static void f2fs_i_callback(struct rcu_head *head)
 {
        struct inode *inode = container_of(head, struct inode, i_rcu);
@@ -132,13 +150,18 @@ int f2fs_sync_fs(struct super_block *sb, int sync)
 {
        struct f2fs_sb_info *sbi = F2FS_SB(sb);
 
+       trace_f2fs_sync_fs(sb, sync);
+
        if (!sbi->s_dirty && !get_pages(sbi, F2FS_DIRTY_NODES))
                return 0;
 
-       if (sync)
+       if (sync) {
+               mutex_lock(&sbi->gc_mutex);
                write_checkpoint(sbi, false);
-       else
+               mutex_unlock(&sbi->gc_mutex);
+       } else {
                f2fs_balance_fs(sbi);
+       }
 
        return 0;
 }
@@ -180,7 +203,7 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf)
        buf->f_files = sbi->total_node_count;
        buf->f_ffree = sbi->total_node_count - valid_inode_count(sbi);
 
-       buf->f_namelen = F2FS_MAX_NAME_LEN;
+       buf->f_namelen = F2FS_NAME_LEN;
        buf->f_fsid.val[0] = (u32)id;
        buf->f_fsid.val[1] = (u32)(id >> 32);
 
@@ -223,6 +246,7 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
 
 static struct super_operations f2fs_sops = {
        .alloc_inode    = f2fs_alloc_inode,
+       .drop_inode     = f2fs_drop_inode,
        .destroy_inode  = f2fs_destroy_inode,
        .write_inode    = f2fs_write_inode,
        .show_options   = f2fs_show_options,
@@ -457,6 +481,7 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
        sbi->root_ino_num = le32_to_cpu(raw_super->root_ino);
        sbi->node_ino_num = le32_to_cpu(raw_super->node_ino);
        sbi->meta_ino_num = le32_to_cpu(raw_super->meta_ino);
+       sbi->cur_victim_sec = NULL_SECNO;
 
        for (i = 0; i < NR_COUNT_TYPE; i++)
                atomic_set(&sbi->nr_pages[i], 0);
@@ -473,7 +498,7 @@ static int validate_superblock(struct super_block *sb,
        if (!*raw_super_buf) {
                f2fs_msg(sb, KERN_ERR, "unable to read %s superblock",
                                super);
-               return 1;
+               return -EIO;
        }
 
        *raw_super = (struct f2fs_super_block *)
@@ -485,7 +510,7 @@ static int validate_superblock(struct super_block *sb,
 
        f2fs_msg(sb, KERN_ERR, "Can't find a valid F2FS filesystem "
                                "in %s superblock", super);
-       return 1;
+       return -EINVAL;
 }
 
 static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
@@ -508,9 +533,12 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
                goto free_sbi;
        }
 
-       if (validate_superblock(sb, &raw_super, &raw_super_buf, 0)) {
+       err = validate_superblock(sb, &raw_super, &raw_super_buf, 0);
+       if (err) {
                brelse(raw_super_buf);
-               if (validate_superblock(sb, &raw_super, &raw_super_buf, 1))
+               /* check secondary superblock when primary failed */
+               err = validate_superblock(sb, &raw_super, &raw_super_buf, 1);
+               if (err)
                        goto free_sb_buf;
        }
        /* init some FS parameters */
@@ -525,7 +553,8 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
        set_opt(sbi, POSIX_ACL);
 #endif
        /* parse mount options */
-       if (parse_options(sb, sbi, (char *)data))
+       err = parse_options(sb, sbi, (char *)data);
+       if (err)
                goto free_sb_buf;
 
        sb->s_maxbytes = max_file_size(le32_to_cpu(raw_super->log_blocksize));
@@ -547,11 +576,11 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
        sbi->raw_super = raw_super;
        sbi->raw_super_buf = raw_super_buf;
        mutex_init(&sbi->gc_mutex);
-       mutex_init(&sbi->write_inode);
        mutex_init(&sbi->writepages);
        mutex_init(&sbi->cp_mutex);
-       for (i = 0; i < NR_LOCK_TYPE; i++)
+       for (i = 0; i < NR_GLOBAL_LOCKS; i++)
                mutex_init(&sbi->fs_lock[i]);
+       mutex_init(&sbi->node_write);
        sbi->por_doing = 0;
        spin_lock_init(&sbi->stat_lock);
        init_rwsem(&sbi->bio_sem);
@@ -638,8 +667,12 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
        }
 
        /* recover fsynced data */
-       if (!test_opt(sbi, DISABLE_ROLL_FORWARD))
-               recover_fsync_data(sbi);
+       if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) {
+               err = recover_fsync_data(sbi);
+               if (err)
+                       f2fs_msg(sb, KERN_ERR,
+                               "Cannot recover all fsync data errno=%d", err);
+       }
 
        /* After POR, we can run background GC thread */
        err = start_gc_thread(sbi);
@@ -650,6 +683,14 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
        if (err)
                goto fail;
 
+       if (test_opt(sbi, DISCARD)) {
+               struct request_queue *q = bdev_get_queue(sb->s_bdev);
+               if (!blk_queue_discard(q))
+                       f2fs_msg(sb, KERN_WARNING,
+                                       "mounting with \"discard\" option, but "
+                                       "the device does not support discard");
+       }
+
        return 0;
 fail:
        stop_gc_thread(sbi);
index 8038c049650473bd260d50006ca75175ec90fa62..0b02dce313565d5f287b96f464efca6cad0ea2cd 100644 (file)
@@ -307,27 +307,30 @@ int f2fs_setxattr(struct inode *inode, int name_index, const char *name,
        int error, found, free, newsize;
        size_t name_len;
        char *pval;
+       int ilock;
 
        if (name == NULL)
                return -EINVAL;
-       name_len = strlen(name);
 
        if (value == NULL)
                value_len = 0;
 
-       if (name_len > 255 || value_len > MAX_VALUE_LEN)
+       name_len = strlen(name);
+
+       if (name_len > F2FS_NAME_LEN || value_len > MAX_VALUE_LEN)
                return -ERANGE;
 
        f2fs_balance_fs(sbi);
 
-       mutex_lock_op(sbi, NODE_NEW);
+       ilock = mutex_lock_op(sbi);
+
        if (!fi->i_xattr_nid) {
                /* Allocate new attribute block */
                struct dnode_of_data dn;
 
                if (!alloc_nid(sbi, &fi->i_xattr_nid)) {
-                       mutex_unlock_op(sbi, NODE_NEW);
-                       return -ENOSPC;
+                       error = -ENOSPC;
+                       goto exit;
                }
                set_new_dnode(&dn, inode, NULL, NULL, fi->i_xattr_nid);
                mark_inode_dirty(inode);
@@ -336,8 +339,8 @@ int f2fs_setxattr(struct inode *inode, int name_index, const char *name,
                if (IS_ERR(page)) {
                        alloc_nid_failed(sbi, fi->i_xattr_nid);
                        fi->i_xattr_nid = 0;
-                       mutex_unlock_op(sbi, NODE_NEW);
-                       return PTR_ERR(page);
+                       error = PTR_ERR(page);
+                       goto exit;
                }
 
                alloc_nid_done(sbi, fi->i_xattr_nid);
@@ -349,8 +352,8 @@ int f2fs_setxattr(struct inode *inode, int name_index, const char *name,
                /* The inode already has an extended attribute block. */
                page = get_node_page(sbi, fi->i_xattr_nid);
                if (IS_ERR(page)) {
-                       mutex_unlock_op(sbi, NODE_NEW);
-                       return PTR_ERR(page);
+                       error = PTR_ERR(page);
+                       goto exit;
                }
 
                base_addr = page_address(page);
@@ -432,12 +435,13 @@ int f2fs_setxattr(struct inode *inode, int name_index, const char *name,
                inode->i_ctime = CURRENT_TIME;
                clear_inode_flag(fi, FI_ACL_MODE);
        }
-       f2fs_write_inode(inode, NULL);
-       mutex_unlock_op(sbi, NODE_NEW);
+       update_inode_page(inode);
+       mutex_unlock_op(sbi, ilock);
 
        return 0;
 cleanup:
        f2fs_put_page(page, 1);
-       mutex_unlock_op(sbi, NODE_NEW);
+exit:
+       mutex_unlock_op(sbi, ilock);
        return error;
 }
index 4ff901632b26a194790054d0f29a94f633c6f8c3..dfce656ddb333a6e7720c8ab0e32984ad82f11f3 100644 (file)
@@ -19,6 +19,7 @@
 #include <linux/mpage.h>
 #include <linux/buffer_head.h>
 #include <linux/mount.h>
+#include <linux/aio.h>
 #include <linux/vfs.h>
 #include <linux/parser.h>
 #include <linux/uio.h>
index 798d4458a4d3a5798a7b04858d82a5f031849bab..3be57189efd5b3a8005321f02e40971af9429cf6 100644 (file)
@@ -22,7 +22,6 @@
 #include <linux/mm.h>
 #include <linux/pagemap.h>
 #include <linux/kthread.h>
-#include <linux/freezer.h>
 #include <linux/writeback.h>
 #include <linux/blkdev.h>
 #include <linux/backing-dev.h>
@@ -88,20 +87,6 @@ static inline struct inode *wb_inode(struct list_head *head)
 #define CREATE_TRACE_POINTS
 #include <trace/events/writeback.h>
 
-/* Wakeup flusher thread or forker thread to fork it. Requires bdi->wb_lock. */
-static void bdi_wakeup_flusher(struct backing_dev_info *bdi)
-{
-       if (bdi->wb.task) {
-               wake_up_process(bdi->wb.task);
-       } else {
-               /*
-                * The bdi thread isn't there, wake up the forker thread which
-                * will create and run it.
-                */
-               wake_up_process(default_backing_dev_info.wb.task);
-       }
-}
-
 static void bdi_queue_work(struct backing_dev_info *bdi,
                           struct wb_writeback_work *work)
 {
@@ -109,10 +94,9 @@ static void bdi_queue_work(struct backing_dev_info *bdi,
 
        spin_lock_bh(&bdi->wb_lock);
        list_add_tail(&work->list, &bdi->work_list);
-       if (!bdi->wb.task)
-               trace_writeback_nothread(bdi, work);
-       bdi_wakeup_flusher(bdi);
        spin_unlock_bh(&bdi->wb_lock);
+
+       mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
 }
 
 static void
@@ -127,10 +111,8 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
         */
        work = kzalloc(sizeof(*work), GFP_ATOMIC);
        if (!work) {
-               if (bdi->wb.task) {
-                       trace_writeback_nowork(bdi);
-                       wake_up_process(bdi->wb.task);
-               }
+               trace_writeback_nowork(bdi);
+               mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
                return;
        }
 
@@ -177,9 +159,7 @@ void bdi_start_background_writeback(struct backing_dev_info *bdi)
         * writeback as soon as there is no other work to do.
         */
        trace_writeback_wake_background(bdi);
-       spin_lock_bh(&bdi->wb_lock);
-       bdi_wakeup_flusher(bdi);
-       spin_unlock_bh(&bdi->wb_lock);
+       mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
 }
 
 /*
@@ -1020,67 +1000,49 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
 
 /*
  * Handle writeback of dirty data for the device backed by this bdi. Also
- * wakes up periodically and does kupdated style flushing.
+ * reschedules periodically and does kupdated style flushing.
  */
-int bdi_writeback_thread(void *data)
+void bdi_writeback_workfn(struct work_struct *work)
 {
-       struct bdi_writeback *wb = data;
+       struct bdi_writeback *wb = container_of(to_delayed_work(work),
+                                               struct bdi_writeback, dwork);
        struct backing_dev_info *bdi = wb->bdi;
        long pages_written;
 
        set_worker_desc("flush-%s", dev_name(bdi->dev));
        current->flags |= PF_SWAPWRITE;
-       set_freezable();
-       wb->last_active = jiffies;
-
-       /*
-        * Our parent may run at a different priority, just set us to normal
-        */
-       set_user_nice(current, 0);
-
-       trace_writeback_thread_start(bdi);
 
-       while (!kthread_freezable_should_stop(NULL)) {
+       if (likely(!current_is_workqueue_rescuer() ||
+                  list_empty(&bdi->bdi_list))) {
                /*
-                * Remove own delayed wake-up timer, since we are already awake
-                * and we'll take care of the periodic write-back.
+                * The normal path.  Keep writing back @bdi until its
+                * work_list is empty.  Note that this path is also taken
+                * if @bdi is shutting down even when we're running off the
+                * rescuer as work_list needs to be drained.
                 */
-               del_timer(&wb->wakeup_timer);
-
-               pages_written = wb_do_writeback(wb, 0);
-
+               do {
+                       pages_written = wb_do_writeback(wb, 0);
+                       trace_writeback_pages_written(pages_written);
+               } while (!list_empty(&bdi->work_list));
+       } else {
+               /*
+                * bdi_wq can't get enough workers and we're running off
+                * the emergency worker.  Don't hog it.  Hopefully, 1024 is
+                * enough for efficient IO.
+                */
+               pages_written = writeback_inodes_wb(&bdi->wb, 1024,
+                                                   WB_REASON_FORKER_THREAD);
                trace_writeback_pages_written(pages_written);
-
-               if (pages_written)
-                       wb->last_active = jiffies;
-
-               set_current_state(TASK_INTERRUPTIBLE);
-               if (!list_empty(&bdi->work_list) || kthread_should_stop()) {
-                       __set_current_state(TASK_RUNNING);
-                       continue;
-               }
-
-               if (wb_has_dirty_io(wb) && dirty_writeback_interval)
-                       schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10));
-               else {
-                       /*
-                        * We have nothing to do, so can go sleep without any
-                        * timeout and save power. When a work is queued or
-                        * something is made dirty - we will be woken up.
-                        */
-                       schedule();
-               }
        }
 
-       /* Flush any work that raced with us exiting */
-       if (!list_empty(&bdi->work_list))
-               wb_do_writeback(wb, 1);
+       if (!list_empty(&bdi->work_list) ||
+           (wb_has_dirty_io(wb) && dirty_writeback_interval))
+               queue_delayed_work(bdi_wq, &wb->dwork,
+                       msecs_to_jiffies(dirty_writeback_interval * 10));
 
-       trace_writeback_thread_stop(bdi);
-       return 0;
+       current->flags &= ~PF_SWAPWRITE;
 }
 
-
 /*
  * Start writeback of `nr_pages' pages.  If `nr_pages' is zero, write back
  * the whole world.
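
Editorial aside on the conversion above from a per-bdi flusher thread to a workqueue item: the moving parts are a delayed_work embedded in the per-bdi state, mod_delayed_work() to pull in a worker immediately when new work is queued, and queue_delayed_work() at the end of the work function to re-arm the periodic kupdate-style pass. A minimal kernel-style sketch of that pattern; the my_* names and the 5-second interval are assumptions for illustration, not code from this patch:

#include <linux/workqueue.h>
#include <linux/jiffies.h>

struct my_flusher {
	struct workqueue_struct	*wq;
	struct delayed_work	dwork;
};

static void my_flush_workfn(struct work_struct *work)
{
	struct my_flusher *f = container_of(to_delayed_work(work),
					    struct my_flusher, dwork);

	/* ... write back dirty data here ... */

	/* re-arm the periodic pass, as bdi_writeback_workfn() does */
	queue_delayed_work(f->wq, &f->dwork, msecs_to_jiffies(5000));
}

static void my_flusher_init(struct my_flusher *f, struct workqueue_struct *wq)
{
	f->wq = wq;
	INIT_DELAYED_WORK(&f->dwork, my_flush_workfn);
}

static void my_flusher_kick(struct my_flusher *f)
{
	/* run as soon as a worker is free, as bdi_queue_work() now does */
	mod_delayed_work(f->wq, &f->dwork, 0);
}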
index b3aaf7b3578bb068aee5df3a35ced44976d33bfc..aef34b1e635e9a424659e721a8a891a3084b8bf4 100644 (file)
@@ -38,6 +38,7 @@
 #include <linux/device.h>
 #include <linux/file.h>
 #include <linux/fs.h>
+#include <linux/aio.h>
 #include <linux/kdev_t.h>
 #include <linux/kthread.h>
 #include <linux/list.h>
index a6c1664e330b0e8610eb3faf3d14df6c8ed3eec2..1d55f94654000dbc8e8c0de37e0cb32471e3791a 100644 (file)
@@ -19,6 +19,7 @@
 #include <linux/pipe_fs_i.h>
 #include <linux/swap.h>
 #include <linux/splice.h>
+#include <linux/aio.h>
 
 MODULE_ALIAS_MISCDEV(FUSE_MINOR);
 MODULE_ALIAS("devname:fuse");
index 4655e59d545b88f7d1494652dd580968a1ff7572..d1c9b85b3f58bfbbc10919b6ee50e7b4ead9bb2f 100644 (file)
@@ -15,6 +15,7 @@
 #include <linux/module.h>
 #include <linux/compat.h>
 #include <linux/swap.h>
+#include <linux/aio.h>
 
 static const struct file_operations fuse_direct_io_file_operations;
 
index 9883694f1e7c491350a21e6f77e09dc3ee054d18..0bad69ed6336e2e1450862f90a886669c16057b3 100644 (file)
@@ -20,6 +20,7 @@
 #include <linux/swap.h>
 #include <linux/gfs2_ondisk.h>
 #include <linux/backing-dev.h>
+#include <linux/aio.h>
 
 #include "gfs2.h"
 #include "incore.h"
index d79c2dadc536662370c3c084b100f1cecd303855..acd16764b133aa896bac5bea77a9bb481e07c74c 100644 (file)
@@ -25,6 +25,7 @@
 #include <asm/uaccess.h>
 #include <linux/dlm.h>
 #include <linux/dlm_plock.h>
+#include <linux/aio.h>
 
 #include "gfs2.h"
 #include "incore.h"
index 7318abf9d0fb863165857fb8adc897ad0f5e7c9e..c5fa758fd8446e1938036be9cdedaf75e2bc552b 100644 (file)
@@ -300,7 +300,7 @@ static struct bio *gfs2_log_get_bio(struct gfs2_sbd *sdp, u64 blkno)
        u64 nblk;
 
        if (bio) {
-               nblk = bio->bi_sector + bio_sectors(bio);
+               nblk = bio_end_sector(bio);
                nblk >>= sdp->sd_fsb2bb_shift;
                if (blkno == nblk)
                        return bio;
index 17c22a8fd40ae1f8a2d59d031da8d9652ae8a36d..e0101b6fb0d73c4a9036a481f5aabd6ddbe213ce 100644 (file)
@@ -176,7 +176,9 @@ static int hfs_dir_release(struct inode *inode, struct file *file)
 {
        struct hfs_readdir_data *rd = file->private_data;
        if (rd) {
+               mutex_lock(&inode->i_mutex);
                list_del(&rd->list);
+               mutex_unlock(&inode->i_mutex);
                kfree(rd);
        }
        return 0;
index 716e1aafb2e24ee747ca15b877cae8200d674a0f..f9299d8a64e3a2af9f6ef2aadd009a097c74ccf6 100644 (file)
@@ -14,6 +14,7 @@
 #include <linux/pagemap.h>
 #include <linux/mpage.h>
 #include <linux/sched.h>
+#include <linux/aio.h>
 
 #include "hfs_fs.h"
 #include "btree.h"
index 7faaa964968eb7ffb9af14163d76f3e84dbbeb51..f833d35630abbd4d98c4ca322e32704d792cf9e9 100644 (file)
@@ -14,6 +14,7 @@
 #include <linux/pagemap.h>
 #include <linux/mpage.h>
 #include <linux/sched.h>
+#include <linux/aio.h>
 
 #include "hfsplus_fs.h"
 #include "hfsplus_raw.h"
index 523464e62849ccea755104d19e7ce39c2ca5fbfd..a3f868ae3fd48f043f91a059ede274b20b431c3b 100644 (file)
@@ -909,11 +909,8 @@ static int can_do_hugetlb_shm(void)
 
 static int get_hstate_idx(int page_size_log)
 {
-       struct hstate *h;
+       struct hstate *h = hstate_sizelog(page_size_log);
 
-       if (!page_size_log)
-               return default_hstate_idx;
-       h = size_to_hstate(1 << page_size_log);
        if (!h)
                return -1;
        return h - hstates;
@@ -929,9 +926,12 @@ static struct dentry_operations anon_ops = {
        .d_dname = hugetlb_dname
 };
 
-struct file *hugetlb_file_setup(const char *name, unsigned long addr,
-                               size_t size, vm_flags_t acctflag,
-                               struct user_struct **user,
+/*
+ * Note that size should be aligned to the proper hugepage size by the caller,
+ * otherwise hugetlb_reserve_pages() reserves one less huge page than intended.
+ */
+struct file *hugetlb_file_setup(const char *name, size_t size,
+                               vm_flags_t acctflag, struct user_struct **user,
                                int creat_flags, int page_size_log)
 {
        struct file *file = ERR_PTR(-ENOMEM);
@@ -939,8 +939,6 @@ struct file *hugetlb_file_setup(const char *name, unsigned long addr,
        struct path path;
        struct super_block *sb;
        struct qstr quick_string;
-       struct hstate *hstate;
-       unsigned long num_pages;
        int hstate_idx;
 
        hstate_idx = get_hstate_idx(page_size_log);
@@ -980,12 +978,10 @@ struct file *hugetlb_file_setup(const char *name, unsigned long addr,
        if (!inode)
                goto out_dentry;
 
-       hstate = hstate_inode(inode);
-       size += addr & ~huge_page_mask(hstate);
-       num_pages = ALIGN(size, huge_page_size(hstate)) >>
-                       huge_page_shift(hstate);
        file = ERR_PTR(-ENOMEM);
-       if (hugetlb_reserve_pages(inode, 0, num_pages, NULL, acctflag))
+       if (hugetlb_reserve_pages(inode, 0,
+                       size >> huge_page_shift(hstate_inode(inode)), NULL,
+                       acctflag))
                goto out_inode;
 
        d_instantiate(path.dentry, inode);
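
Editorial aside on the hugetlbfs change above: hugetlb_file_setup() no longer rounds the size up itself, so, per the new comment, the caller is expected to pass a size already aligned to the huge page size. A standalone sketch of that caller-side rounding, with 2 MB used as an assumed huge page size:

#include <stdio.h>

/* round x up to a power-of-two boundary a, like the kernel's ALIGN() */
#define ALIGN_UP(x, a)	(((x) + (a) - 1) & ~((unsigned long)(a) - 1))

int main(void)
{
	unsigned long huge_page_size = 2UL << 20;	/* example: 2 MB pages */
	unsigned long requested = (3UL << 20) + 4096;	/* an unaligned request */
	unsigned long aligned = ALIGN_UP(requested, huge_page_size);

	printf("request %lu -> reserve %lu bytes (%lu huge pages)\n",
	       requested, aligned, aligned / huge_page_size);
	return 0;
}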
index 77554b61d1247b2346c4c337d874281504e66f1c..730f24e282a652029ca14b0f5032411512914beb 100644 (file)
@@ -23,6 +23,7 @@
 #include <linux/pagemap.h>
 #include <linux/quotaops.h>
 #include <linux/writeback.h>
+#include <linux/aio.h>
 #include "jfs_incore.h"
 #include "jfs_inode.h"
 #include "jfs_filsys.h"
index cbe48ea9318eea1ce44cae4bf3190911edf6c4c4..c57499dca89c5a3910bcefc5af951179aa693f24 100644 (file)
@@ -2005,7 +2005,6 @@ static int lbmRead(struct jfs_log * log, int pn, struct lbuf ** bpp)
        bio->bi_io_vec[0].bv_offset = bp->l_offset;
 
        bio->bi_vcnt = 1;
-       bio->bi_idx = 0;
        bio->bi_size = LOGPSIZE;
 
        bio->bi_end_io = lbmIODone;
@@ -2146,7 +2145,6 @@ static void lbmStartIO(struct lbuf * bp)
        bio->bi_io_vec[0].bv_offset = bp->l_offset;
 
        bio->bi_vcnt = 1;
-       bio->bi_idx = 0;
        bio->bi_size = LOGPSIZE;
 
        bio->bi_end_io = lbmIODone;
index e784a217b50067919ad3ebffe559b3552b58a9bc..550475ca6a0e0ec35c82d90b10372f2e4434fe90 100644 (file)
@@ -32,7 +32,6 @@ static int sync_request(struct page *page, struct block_device *bdev, int rw)
        bio_vec.bv_len = PAGE_SIZE;
        bio_vec.bv_offset = 0;
        bio.bi_vcnt = 1;
-       bio.bi_idx = 0;
        bio.bi_size = PAGE_SIZE;
        bio.bi_bdev = bdev;
        bio.bi_sector = page->index * (PAGE_SIZE >> 9);
@@ -108,7 +107,6 @@ static int __bdev_writeseg(struct super_block *sb, u64 ofs, pgoff_t index,
                if (i >= max_pages) {
                        /* Block layer cannot split bios :( */
                        bio->bi_vcnt = i;
-                       bio->bi_idx = 0;
                        bio->bi_size = i * PAGE_SIZE;
                        bio->bi_bdev = super->s_bdev;
                        bio->bi_sector = ofs >> 9;
@@ -136,7 +134,6 @@ static int __bdev_writeseg(struct super_block *sb, u64 ofs, pgoff_t index,
                unlock_page(page);
        }
        bio->bi_vcnt = nr_pages;
-       bio->bi_idx = 0;
        bio->bi_size = nr_pages * PAGE_SIZE;
        bio->bi_bdev = super->s_bdev;
        bio->bi_sector = ofs >> 9;
@@ -202,7 +199,6 @@ static int do_erase(struct super_block *sb, u64 ofs, pgoff_t index,
                if (i >= max_pages) {
                        /* Block layer cannot split bios :( */
                        bio->bi_vcnt = i;
-                       bio->bi_idx = 0;
                        bio->bi_size = i * PAGE_SIZE;
                        bio->bi_bdev = super->s_bdev;
                        bio->bi_sector = ofs >> 9;
@@ -224,7 +220,6 @@ static int do_erase(struct super_block *sb, u64 ofs, pgoff_t index,
                bio->bi_io_vec[i].bv_offset = 0;
        }
        bio->bi_vcnt = nr_pages;
-       bio->bi_idx = 0;
        bio->bi_size = nr_pages * PAGE_SIZE;
        bio->bi_bdev = super->s_bdev;
        bio->bi_sector = ofs >> 9;
index f4891bde8851d8092012f06811496de724b2d5e4..8485978993e85bcbda9772b55a2ed26f5442a228 100644 (file)
@@ -173,7 +173,7 @@ struct bl_msg_hdr {
 /* blocklayoutdev.c */
 ssize_t bl_pipe_downcall(struct file *, const char __user *, size_t);
 void bl_pipe_destroy_msg(struct rpc_pipe_msg *);
-int nfs4_blkdev_put(struct block_device *bdev);
+void nfs4_blkdev_put(struct block_device *bdev);
 struct pnfs_block_dev *nfs4_blk_decode_device(struct nfs_server *server,
                                                struct pnfs_device *dev);
 int nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo,
index a86c5bdad9e3119155889c37ae91052f7b9cc3ef..04303b5c93618b9ad9c73bbde74e4f64e64c684f 100644 (file)
@@ -56,11 +56,11 @@ static int decode_sector_number(__be32 **rp, sector_t *sp)
 /*
  * Release the block device
  */
-int nfs4_blkdev_put(struct block_device *bdev)
+void nfs4_blkdev_put(struct block_device *bdev)
 {
        dprintk("%s for device %d:%d\n", __func__, MAJOR(bdev->bd_dev),
                        MINOR(bdev->bd_dev));
-       return blkdev_put(bdev, FMODE_READ);
+       blkdev_put(bdev, FMODE_READ);
 }
 
 ssize_t bl_pipe_downcall(struct file *filp, const char __user *src,
index 6fc7b5cae92bf6526bee07696f12977322f2cb5e..8999cfddd866a352e0937676bfcb94b40f03d120 100644 (file)
@@ -88,14 +88,8 @@ out:
  */
 static void nfs4_blk_metadev_release(struct pnfs_block_dev *bdev)
 {
-       int rv;
-
        dprintk("%s Releasing\n", __func__);
-       rv = nfs4_blkdev_put(bdev->bm_mdev);
-       if (rv)
-               printk(KERN_ERR "NFS: %s nfs4_blkdev_put returns %d\n",
-                               __func__, rv);
-
+       nfs4_blkdev_put(bdev->bm_mdev);
        dev_remove(bdev->net, bdev->bm_mdev->bd_dev);
 }
 
index 553a83cc41061fdd18d8b125d5e7b9ffaea8e99e..a1dd768d0a350fd93498ed7a96f068a070df35b7 100644 (file)
@@ -47,6 +47,8 @@ struct nfs4_minor_version_ops {
                        const nfs4_stateid *);
        int     (*find_root_sec)(struct nfs_server *, struct nfs_fh *,
                        struct nfs_fsinfo *);
+       int     (*free_lock_state)(struct nfs_server *,
+                       struct nfs4_lock_state *);
        const struct nfs4_state_recovery_ops *reboot_recovery_ops;
        const struct nfs4_state_recovery_ops *nograce_recovery_ops;
        const struct nfs4_state_maintenance_ops *state_renewal_ops;
@@ -234,7 +236,6 @@ extern int nfs4_proc_fs_locations(struct rpc_clnt *, struct inode *, const struc
 extern struct rpc_clnt *nfs4_proc_lookup_mountpoint(struct inode *, struct qstr *,
                            struct nfs_fh *, struct nfs_fattr *);
 extern int nfs4_proc_secinfo(struct inode *, const struct qstr *, struct nfs4_secinfo_flavors *);
-extern int nfs4_release_lockowner(struct nfs4_lock_state *);
 extern const struct xattr_handler *nfs4_xattr_handlers[];
 extern int nfs4_set_rw_stateid(nfs4_stateid *stateid,
                const struct nfs_open_context *ctx,
index b8da95548d3d7a10b409a70a852b0245eeb59b7a..235ff952d3c8620cfe9e5786abf1f3b41f364e84 100644 (file)
@@ -70,6 +70,8 @@ struct nfs4_pnfs_ds {
        struct list_head        ds_addrs;
        struct nfs_client       *ds_clp;
        atomic_t                ds_count;
+       unsigned long           ds_state;
+#define NFS4DS_CONNECTING      0       /* ds is establishing connection */
 };
 
 struct nfs4_file_layout_dsaddr {
index 1fe284f01f8b6419b3c7a9a5728ea5fc33cad583..661a0f6112156ce554be5124c4cadec2294ce6cb 100644 (file)
@@ -775,6 +775,22 @@ nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j)
        return flseg->fh_array[i];
 }
 
+static void nfs4_wait_ds_connect(struct nfs4_pnfs_ds *ds)
+{
+       might_sleep();
+       wait_on_bit(&ds->ds_state, NFS4DS_CONNECTING,
+                       nfs_wait_bit_killable, TASK_KILLABLE);
+}
+
+static void nfs4_clear_ds_conn_bit(struct nfs4_pnfs_ds *ds)
+{
+       smp_mb__before_clear_bit();
+       clear_bit(NFS4DS_CONNECTING, &ds->ds_state);
+       smp_mb__after_clear_bit();
+       wake_up_bit(&ds->ds_state, NFS4DS_CONNECTING);
+}
+
+
 struct nfs4_pnfs_ds *
 nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx)
 {
@@ -791,16 +807,22 @@ nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx)
                filelayout_mark_devid_invalid(devid);
                return NULL;
        }
+       if (ds->ds_clp)
+               return ds;
 
-       if (!ds->ds_clp) {
+       if (test_and_set_bit(NFS4DS_CONNECTING, &ds->ds_state) == 0) {
                struct nfs_server *s = NFS_SERVER(lseg->pls_layout->plh_inode);
                int err;
 
                err = nfs4_ds_connect(s, ds);
                if (err) {
                        nfs4_mark_deviceid_unavailable(devid);
-                       return NULL;
+                       ds = NULL;
                }
+               nfs4_clear_ds_conn_bit(ds);
+       } else {
+               /* Either ds is connected, or ds is NULL */
+               nfs4_wait_ds_connect(ds);
        }
        return ds;
 }
index 9da4bd55eb3019a964f8a7a7b0f89e2bab4c8b42..8fbc100541154cbd31253dc261eb5eaafbe71e29 100644 (file)
@@ -4766,9 +4766,9 @@ static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *
        if (status != 0)
                goto out;
        /* Is this a delegated lock? */
-       if (test_bit(NFS_DELEGATED_STATE, &state->flags))
-               goto out;
        lsp = request->fl_u.nfs4_fl.owner;
+       if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) == 0)
+               goto out;
        seqid = nfs_alloc_seqid(&lsp->ls_seqid, GFP_KERNEL);
        status = -ENOMEM;
        if (seqid == NULL)
@@ -5238,9 +5238,8 @@ static const struct rpc_call_ops nfs4_release_lockowner_ops = {
        .rpc_release = nfs4_release_lockowner_release,
 };
 
-int nfs4_release_lockowner(struct nfs4_lock_state *lsp)
+static int nfs4_release_lockowner(struct nfs_server *server, struct nfs4_lock_state *lsp)
 {
-       struct nfs_server *server = lsp->ls_state->owner->so_server;
        struct nfs_release_lockowner_data *data;
        struct rpc_message msg = {
                .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RELEASE_LOCKOWNER],
@@ -6783,26 +6782,76 @@ static int nfs41_test_stateid(struct nfs_server *server, nfs4_stateid *stateid)
        return err;
 }
 
-static int _nfs4_free_stateid(struct nfs_server *server, nfs4_stateid *stateid)
-{
-       struct nfs41_free_stateid_args args = {
-               .stateid = stateid,
-       };
+struct nfs_free_stateid_data {
+       struct nfs_server *server;
+       struct nfs41_free_stateid_args args;
        struct nfs41_free_stateid_res res;
+};
+
+static void nfs41_free_stateid_prepare(struct rpc_task *task, void *calldata)
+{
+       struct nfs_free_stateid_data *data = calldata;
+       nfs41_setup_sequence(nfs4_get_session(data->server),
+                       &data->args.seq_args,
+                       &data->res.seq_res,
+                       task);
+}
+
+static void nfs41_free_stateid_done(struct rpc_task *task, void *calldata)
+{
+       struct nfs_free_stateid_data *data = calldata;
+
+       nfs41_sequence_done(task, &data->res.seq_res);
+
+       switch (task->tk_status) {
+       case -NFS4ERR_DELAY:
+               if (nfs4_async_handle_error(task, data->server, NULL) == -EAGAIN)
+                       rpc_restart_call_prepare(task);
+       }
+}
+
+static void nfs41_free_stateid_release(void *calldata)
+{
+       kfree(calldata);
+}
+
+const struct rpc_call_ops nfs41_free_stateid_ops = {
+       .rpc_call_prepare = nfs41_free_stateid_prepare,
+       .rpc_call_done = nfs41_free_stateid_done,
+       .rpc_release = nfs41_free_stateid_release,
+};
+
+static struct rpc_task *_nfs41_free_stateid(struct nfs_server *server,
+               nfs4_stateid *stateid,
+               bool privileged)
+{
        struct rpc_message msg = {
                .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_FREE_STATEID],
-               .rpc_argp = &args,
-               .rpc_resp = &res,
        };
-       int status;
+       struct rpc_task_setup task_setup = {
+               .rpc_client = server->client,
+               .rpc_message = &msg,
+               .callback_ops = &nfs41_free_stateid_ops,
+               .flags = RPC_TASK_ASYNC,
+       };
+       struct nfs_free_stateid_data *data;
 
        dprintk("NFS call  free_stateid %p\n", stateid);
-       nfs41_init_sequence(&args.seq_args, &res.seq_res, 0);
-       nfs4_set_sequence_privileged(&args.seq_args);
-       status = nfs4_call_sync_sequence(server->client, server, &msg,
-                       &args.seq_args, &res.seq_res);
-       dprintk("NFS reply free_stateid: %d\n", status);
-       return status;
+       data = kmalloc(sizeof(*data), GFP_NOFS);
+       if (!data)
+               return ERR_PTR(-ENOMEM);
+       data->server = server;
+       nfs4_stateid_copy(&data->args.stateid, stateid);
+
+       task_setup.callback_data = data;
+
+       msg.rpc_argp = &data->args;
+       msg.rpc_resp = &data->res;
+       nfs41_init_sequence(&data->args.seq_args, &data->res.seq_res, 0);
+       if (privileged)
+               nfs4_set_sequence_privileged(&data->args.seq_args);
+
+       return rpc_run_task(&task_setup);
 }
 
 /**
@@ -6816,15 +6865,29 @@ static int _nfs4_free_stateid(struct nfs_server *server, nfs4_stateid *stateid)
  */
 static int nfs41_free_stateid(struct nfs_server *server, nfs4_stateid *stateid)
 {
-       struct nfs4_exception exception = { };
-       int err;
-       do {
-               err = _nfs4_free_stateid(server, stateid);
-               if (err != -NFS4ERR_DELAY)
-                       break;
-               nfs4_handle_exception(server, err, &exception);
-       } while (exception.retry);
-       return err;
+       struct rpc_task *task;
+       int ret;
+
+       task = _nfs41_free_stateid(server, stateid, true);
+       if (IS_ERR(task))
+               return PTR_ERR(task);
+       ret = rpc_wait_for_completion_task(task);
+       if (!ret)
+               ret = task->tk_status;
+       rpc_put_task(task);
+       return ret;
+}
+
+static int nfs41_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp)
+{
+       struct rpc_task *task;
+
+       task = _nfs41_free_stateid(server, &lsp->ls_stateid, false);
+       nfs4_free_lock_state(server, lsp);
+       if (IS_ERR(task))
+               return PTR_ERR(task);
+       rpc_put_task(task);
+       return 0;
 }
 
 static bool nfs41_match_stateid(const nfs4_stateid *s1,
@@ -6916,6 +6979,7 @@ static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = {
        .call_sync = _nfs4_call_sync,
        .match_stateid = nfs4_match_stateid,
        .find_root_sec = nfs4_find_root_sec,
+       .free_lock_state = nfs4_release_lockowner,
        .reboot_recovery_ops = &nfs40_reboot_recovery_ops,
        .nograce_recovery_ops = &nfs40_nograce_recovery_ops,
        .state_renewal_ops = &nfs40_state_renewal_ops,
@@ -6933,6 +6997,7 @@ static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = {
        .call_sync = nfs4_call_sync_sequence,
        .match_stateid = nfs41_match_stateid,
        .find_root_sec = nfs41_find_root_sec,
+       .free_lock_state = nfs41_free_lock_state,
        .reboot_recovery_ops = &nfs41_reboot_recovery_ops,
        .nograce_recovery_ops = &nfs41_nograce_recovery_ops,
        .state_renewal_ops = &nfs41_state_renewal_ops,
index 0b32f9483b7afb07d993fb92568bec662b3dd9a4..300d17d85c0e03397d352746fc6fc775b39941e1 100644 (file)
@@ -921,6 +921,7 @@ static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_
  */
 void nfs4_put_lock_state(struct nfs4_lock_state *lsp)
 {
+       struct nfs_server *server;
        struct nfs4_state *state;
 
        if (lsp == NULL)
@@ -932,11 +933,13 @@ void nfs4_put_lock_state(struct nfs4_lock_state *lsp)
        if (list_empty(&state->lock_states))
                clear_bit(LK_STATE_IN_USE, &state->flags);
        spin_unlock(&state->state_lock);
+       server = state->owner->so_server;
        if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags)) {
-               if (nfs4_release_lockowner(lsp) == 0)
-                       return;
-       }
-       nfs4_free_lock_state(lsp->ls_state->owner->so_server, lsp);
+               struct nfs_client *clp = server->nfs_client;
+
+               clp->cl_mvops->free_lock_state(server, lsp);
+       } else
+               nfs4_free_lock_state(server, lsp);
 }
 
 static void nfs4_fl_copy_lock(struct file_lock *dst, struct file_lock *src)
index 3c79c5878c6da6689df58f4b0990a777798fdf9e..4be8d135ed61b19bc14f511b0013382211a64c0c 100644 (file)
@@ -2003,7 +2003,7 @@ static void encode_free_stateid(struct xdr_stream *xdr,
                                struct compound_hdr *hdr)
 {
        encode_op_hdr(xdr, OP_FREE_STATEID, decode_free_stateid_maxsz, hdr);
-       encode_nfs4_stateid(xdr, args->stateid);
+       encode_nfs4_stateid(xdr, &args->stateid);
 }
 #endif /* CONFIG_NFS_V4_1 */
 
index 1bb071dca9ab1349deecd44e19ec7e0531b737ea..a366107a7331ad36864ba81b8b14ba940756ac70 100644 (file)
@@ -1610,16 +1610,15 @@ out_security_failure:
 /*
  * Select a security flavor for this mount.  The selected flavor
  * is planted in args->auth_flavors[0].
+ *
+ * Returns 0 on success, -EACCES on failure.
  */
-static void nfs_select_flavor(struct nfs_parsed_mount_data *args,
+static int nfs_select_flavor(struct nfs_parsed_mount_data *args,
                              struct nfs_mount_request *request)
 {
        unsigned int i, count = *(request->auth_flav_len);
        rpc_authflavor_t flavor;
 
-       if (args->auth_flavors[0] != RPC_AUTH_MAXFLAVOR)
-               goto out;
-
        /*
         * The NFSv2 MNT operation does not return a flavor list.
         */
@@ -1633,6 +1632,25 @@ static void nfs_select_flavor(struct nfs_parsed_mount_data *args,
        if (count == 0)
                goto out_default;
 
+       /*
+        * If the sec= mount option is used, the specified flavor or AUTH_NULL
+        * must be in the list returned by the server.
+        *
+        * AUTH_NULL has a special meaning when it's in the server list - it
+        * means that the server will ignore the rpc creds, so any flavor
+        * can be used.
+        */
+       if (args->auth_flavors[0] != RPC_AUTH_MAXFLAVOR) {
+               for (i = 0; i < count; i++) {
+                       if (args->auth_flavors[0] == request->auth_flavs[i] ||
+                           request->auth_flavs[i] == RPC_AUTH_NULL)
+                               goto out;
+               }
+               dfprintk(MOUNT, "NFS: auth flavor %d not supported by server\n",
+                       args->auth_flavors[0]);
+               goto out_err;
+       }
+
        /*
         * RFC 2623, section 2.7 suggests we SHOULD prefer the
         * flavor listed first.  However, some servers list
@@ -1653,12 +1671,29 @@ static void nfs_select_flavor(struct nfs_parsed_mount_data *args,
                }
        }
 
+       /*
+        * As a last chance, see if the server list contains AUTH_NULL -
+        * if it does, use the default flavor.
+        */
+       for (i = 0; i < count; i++) {
+               if (request->auth_flavs[i] == RPC_AUTH_NULL)
+                       goto out_default;
+       }
+
+       dfprintk(MOUNT, "NFS: no auth flavors in common with server\n");
+       goto out_err;
+
 out_default:
-       flavor = RPC_AUTH_UNIX;
+       /* use default if flavor not already set */
+       flavor = (args->auth_flavors[0] == RPC_AUTH_MAXFLAVOR) ?
+               RPC_AUTH_UNIX : args->auth_flavors[0];
 out_set:
        args->auth_flavors[0] = flavor;
 out:
        dfprintk(MOUNT, "NFS: using auth flavor %d\n", args->auth_flavors[0]);
+       return 0;
+out_err:
+       return -EACCES;
 }
 
 /*
@@ -1721,8 +1756,7 @@ static int nfs_request_mount(struct nfs_parsed_mount_data *args,
                return status;
        }
 
-       nfs_select_flavor(args, &request);
-       return 0;
+       return nfs_select_flavor(args, &request);
 }
 
 struct dentry *nfs_try_mount(int flags, const char *dev_name,
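
Editorial aside on the nfs_select_flavor() changes above: with an explicit sec= mount option the chosen flavor must now appear in the list returned by the server (or the server must list AUTH_NULL, which means it ignores the creds, so any flavor is acceptable), and when nothing matches the mount fails with -EACCES instead of proceeding silently. A standalone, deliberately simplified sketch of just that acceptance check; the flavor constants are placeholders, not the kernel's values:

#include <stdio.h>
#include <stdbool.h>

enum { AUTH_NULL = 0, AUTH_UNIX = 1, AUTH_KRB5 = 2 };	/* placeholder values */

/* is the flavor requested via sec= acceptable given the server's list? */
static bool sec_option_ok(unsigned int wanted,
			  const unsigned int *server_list, int count)
{
	int i;

	for (i = 0; i < count; i++) {
		if (server_list[i] == wanted)
			return true;		/* exact match */
		if (server_list[i] == AUTH_NULL)
			return true;		/* server ignores creds */
	}
	return false;				/* caller would return -EACCES */
}

int main(void)
{
	unsigned int list[] = { AUTH_UNIX };

	printf("sec=sys  -> %d\n", sec_option_ok(AUTH_UNIX, list, 1));	/* 1 */
	printf("sec=krb5 -> %d\n", sec_option_ok(AUTH_KRB5, list, 1));	/* 0 */
	return 0;
}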
index cf02f5530713e4354b20119d68346bcd25f8ec57..689fb608648e9a80db3c4e643feb3fe00ff1cfa4 100644 (file)
@@ -25,7 +25,7 @@
 #include <linux/gfp.h>
 #include <linux/mpage.h>
 #include <linux/writeback.h>
-#include <linux/uio.h>
+#include <linux/aio.h>
 #include "nilfs.h"
 #include "btnode.h"
 #include "segment.h"
index 1da4b81e6f76984ad7b5c9b52237d24c2fd1894f..c5670b8d198caf5aea663b224c02e7289d572565 100644 (file)
@@ -27,6 +27,7 @@
 #include <linux/swap.h>
 #include <linux/uio.h>
 #include <linux/writeback.h>
+#include <linux/aio.h>
 
 #include <asm/page.h>
 #include <asm/uaccess.h>
index d3e118cc6ffa8f0016c5eb93e246aded38a1cc14..2778b0255dc6c0c845f2b222ac45605f77737442 100644 (file)
@@ -28,6 +28,7 @@
 #include <linux/quotaops.h>
 #include <linux/slab.h>
 #include <linux/log2.h>
+#include <linux/aio.h>
 
 #include "aops.h"
 #include "attrib.h"
index ffb2da370a99d05dd4b919fc64a5483dbc2df7a3..f671e49beb348b5c33dfba3e3c4b5f322c73e43b 100644 (file)
@@ -22,6 +22,8 @@
 #ifndef OCFS2_AOPS_H
 #define OCFS2_AOPS_H
 
+#include <linux/aio.h>
+
 handle_t *ocfs2_start_walk_page_trans(struct inode *inode,
                                                         struct page *page,
                                                         unsigned from,
index 12ae194ac9431ba82755d0d5020a385f25d4afa9..3a44a648dae7709b1cd5431c426cf3b9e057e54a 100644 (file)
@@ -2322,7 +2322,7 @@ int ocfs2_inode_lock_full_nested(struct inode *inode,
        status = __ocfs2_cluster_lock(osb, lockres, level, dlm_flags,
                                      arg_flags, subclass, _RET_IP_);
        if (status < 0) {
-               if (status != -EAGAIN && status != -EIOCBRETRY)
+               if (status != -EAGAIN)
                        mlog_errno(status);
                goto bail;
        }
index 88924a3133fae7c15ca3f5a5259b64eecd97022e..621fc73bf23de3f58ae331e8d7109b23d80e039e 100644 (file)
@@ -147,8 +147,6 @@ void ocfs2_refresh_inode(struct inode *inode,
 int ocfs2_mark_inode_dirty(handle_t *handle,
                           struct inode *inode,
                           struct buffer_head *bh);
-int ocfs2_aio_read(struct file *file, struct kiocb *req, struct iocb *iocb);
-int ocfs2_aio_write(struct file *file, struct kiocb *req, struct iocb *iocb);
 struct buffer_head *ocfs2_bread(struct inode *inode,
                                int block, int *err, int reada);
 
index a029a14bacf1ff5d304caa6cf9471468fd3d39c0..d2c45e14e6d8126e41bc6c463a509e68946e9fa4 100644 (file)
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -21,6 +21,7 @@
 #include <linux/audit.h>
 #include <linux/syscalls.h>
 #include <linux/fcntl.h>
+#include <linux/aio.h>
 
 #include <asm/uaccess.h>
 #include <asm/ioctls.h>
index 90ba3b350e5063daa547f35814fd5e4c8c660a4f..03430008704e68fd74470e8dbb9fcb637dce3f97 100644 (file)
@@ -9,6 +9,7 @@
 #include <linux/fcntl.h>
 #include <linux/file.h>
 #include <linux/uio.h>
+#include <linux/aio.h>
 #include <linux/fsnotify.h>
 #include <linux/security.h>
 #include <linux/export.h>
@@ -329,16 +330,6 @@ int rw_verify_area(int read_write, struct file *file, loff_t *ppos, size_t count
        return count > MAX_RW_COUNT ? MAX_RW_COUNT : count;
 }
 
-static void wait_on_retry_sync_kiocb(struct kiocb *iocb)
-{
-       set_current_state(TASK_UNINTERRUPTIBLE);
-       if (!kiocbIsKicked(iocb))
-               schedule();
-       else
-               kiocbClearKicked(iocb);
-       __set_current_state(TASK_RUNNING);
-}
-
 ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
 {
        struct iovec iov = { .iov_base = buf, .iov_len = len };
@@ -350,13 +341,7 @@ ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *pp
        kiocb.ki_left = len;
        kiocb.ki_nbytes = len;
 
-       for (;;) {
-               ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
-               if (ret != -EIOCBRETRY)
-                       break;
-               wait_on_retry_sync_kiocb(&kiocb);
-       }
-
+       ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
        if (-EIOCBQUEUED == ret)
                ret = wait_on_sync_kiocb(&kiocb);
        *ppos = kiocb.ki_pos;
@@ -406,13 +391,7 @@ ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, lof
        kiocb.ki_left = len;
        kiocb.ki_nbytes = len;
 
-       for (;;) {
-               ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
-               if (ret != -EIOCBRETRY)
-                       break;
-               wait_on_retry_sync_kiocb(&kiocb);
-       }
-
+       ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
        if (-EIOCBQUEUED == ret)
                ret = wait_on_sync_kiocb(&kiocb);
        *ppos = kiocb.ki_pos;
@@ -592,13 +571,7 @@ static ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov,
        kiocb.ki_left = len;
        kiocb.ki_nbytes = len;
 
-       for (;;) {
-               ret = fn(&kiocb, iov, nr_segs, kiocb.ki_pos);
-               if (ret != -EIOCBRETRY)
-                       break;
-               wait_on_retry_sync_kiocb(&kiocb);
-       }
-
+       ret = fn(&kiocb, iov, nr_segs, kiocb.ki_pos);
        if (ret == -EIOCBQUEUED)
                ret = wait_on_sync_kiocb(&kiocb);
        *ppos = kiocb.ki_pos;
index ea5061fd4f3e04b055421c1d8d6c5657d250e758..77d6d47abc838be3a18acc837f9f97816974e6e1 100644 (file)
@@ -18,6 +18,7 @@
 #include <linux/writeback.h>
 #include <linux/quotaops.h>
 #include <linux/swap.h>
+#include <linux/aio.h>
 
 int reiserfs_commit_write(struct file *f, struct page *page,
                          unsigned from, unsigned to);
index afcadcc03e8ac87c7f25f3e2393b3c108daaf91d..742fdd4c209ae90500b49a78ce52d27689c03ba9 100644 (file)
@@ -97,7 +97,7 @@ static int flush_commit_list(struct super_block *s,
 static int can_dirty(struct reiserfs_journal_cnode *cn);
 static int journal_join(struct reiserfs_transaction_handle *th,
                        struct super_block *sb, unsigned long nblocks);
-static int release_journal_dev(struct super_block *super,
+static void release_journal_dev(struct super_block *super,
                               struct reiserfs_journal *journal);
 static int dirty_one_transaction(struct super_block *s,
                                 struct reiserfs_journal_list *jl);
@@ -2532,23 +2532,13 @@ static void journal_list_init(struct super_block *sb)
        SB_JOURNAL(sb)->j_current_jl = alloc_journal_list(sb);
 }
 
-static int release_journal_dev(struct super_block *super,
+static void release_journal_dev(struct super_block *super,
                               struct reiserfs_journal *journal)
 {
-       int result;
-
-       result = 0;
-
        if (journal->j_dev_bd != NULL) {
-               result = blkdev_put(journal->j_dev_bd, journal->j_dev_mode);
+               blkdev_put(journal->j_dev_bd, journal->j_dev_mode);
                journal->j_dev_bd = NULL;
        }
-
-       if (result != 0) {
-               reiserfs_warning(super, "sh-457",
-                                "Cannot release journal device: %i", result);
-       }
-       return result;
 }
 
 static int journal_init_dev(struct super_block *super,
index f12189d2db1db7aa1620accfb4cc9d1eb09cf8a8..14374530784c683f36a15e52b64d792578a8ec07 100644 (file)
@@ -50,6 +50,7 @@
  */
 
 #include "ubifs.h"
+#include <linux/aio.h>
 #include <linux/mount.h>
 #include <linux/namei.h>
 #include <linux/slab.h>
index 7a12e48ad8196d51273fcc2ca66e89d077eb66ee..b6d15d349810fe5ca21649208bd86d220caf338c 100644 (file)
@@ -38,6 +38,7 @@
 #include <linux/slab.h>
 #include <linux/crc-itu-t.h>
 #include <linux/mpage.h>
+#include <linux/aio.h>
 
 #include "udf_i.h"
 #include "udf_sb.h"
index 3244c988d379a455117fba0c4696724515c1feec..2b2691b7342890e64e957d616dfb1c4aa2efc8c1 100644 (file)
@@ -31,6 +31,7 @@
 #include "xfs_vnodeops.h"
 #include "xfs_trace.h"
 #include "xfs_bmap.h"
+#include <linux/aio.h>
 #include <linux/gfp.h>
 #include <linux/mpage.h>
 #include <linux/pagevec.h>
index 054d60c0ac57c749dbe652b3ec2a02c34ef1ab9d..a5f2042aec8b27e730f0cbdedaef9eb50c9422f0 100644 (file)
@@ -36,6 +36,7 @@
 #include "xfs_ioctl.h"
 #include "xfs_trace.h"
 
+#include <linux/aio.h>
 #include <linux/dcache.h>
 #include <linux/falloc.h>
 #include <linux/pagevec.h>
diff --git a/include/linux/acpi_dma.h b/include/linux/acpi_dma.h
new file mode 100644 (file)
index 0000000..d09deab
--- /dev/null
@@ -0,0 +1,116 @@
+/*
+ * ACPI helpers for DMA request / controller
+ *
+ * Based on of_dma.h
+ *
+ * Copyright (C) 2013, Intel Corporation
+ * Author: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef __LINUX_ACPI_DMA_H
+#define __LINUX_ACPI_DMA_H
+
+#include <linux/list.h>
+#include <linux/device.h>
+#include <linux/dmaengine.h>
+
+/**
+ * struct acpi_dma_spec - slave device DMA resources
+ * @chan_id:   channel unique id
+ * @slave_id:  request line unique id
+ * @dev:       struct device of the DMA controller to be used in the filter
+ *             function
+ */
+struct acpi_dma_spec {
+       int             chan_id;
+       int             slave_id;
+       struct device   *dev;
+};
+
+/**
+ * struct acpi_dma - representation of the registered DMAC
+ * @dma_controllers:   linked list node
+ * @dev:               struct device of this controller
+ * @acpi_dma_xlate:    callback function to find a suitable channel
+ * @data:              private data used by a callback function
+ */
+struct acpi_dma {
+       struct list_head        dma_controllers;
+       struct device           *dev;
+       struct dma_chan         *(*acpi_dma_xlate)
+                               (struct acpi_dma_spec *, struct acpi_dma *);
+       void                    *data;
+};
+
+/* Used with acpi_dma_simple_xlate() */
+struct acpi_dma_filter_info {
+       dma_cap_mask_t  dma_cap;
+       dma_filter_fn   filter_fn;
+};
+
+#ifdef CONFIG_DMA_ACPI
+
+int acpi_dma_controller_register(struct device *dev,
+               struct dma_chan *(*acpi_dma_xlate)
+               (struct acpi_dma_spec *, struct acpi_dma *),
+               void *data);
+int acpi_dma_controller_free(struct device *dev);
+int devm_acpi_dma_controller_register(struct device *dev,
+               struct dma_chan *(*acpi_dma_xlate)
+               (struct acpi_dma_spec *, struct acpi_dma *),
+               void *data);
+void devm_acpi_dma_controller_free(struct device *dev);
+
+struct dma_chan *acpi_dma_request_slave_chan_by_index(struct device *dev,
+                                                     size_t index);
+struct dma_chan *acpi_dma_request_slave_chan_by_name(struct device *dev,
+                                                    const char *name);
+
+struct dma_chan *acpi_dma_simple_xlate(struct acpi_dma_spec *dma_spec,
+                                      struct acpi_dma *adma);
+#else
+
+static inline int acpi_dma_controller_register(struct device *dev,
+               struct dma_chan *(*acpi_dma_xlate)
+               (struct acpi_dma_spec *, struct acpi_dma *),
+               void *data)
+{
+       return -ENODEV;
+}
+static inline int acpi_dma_controller_free(struct device *dev)
+{
+       return -ENODEV;
+}
+static inline int devm_acpi_dma_controller_register(struct device *dev,
+               struct dma_chan *(*acpi_dma_xlate)
+               (struct acpi_dma_spec *, struct acpi_dma *),
+               void *data)
+{
+       return -ENODEV;
+}
+static inline void devm_acpi_dma_controller_free(struct device *dev)
+{
+}
+
+static inline struct dma_chan *acpi_dma_request_slave_chan_by_index(
+               struct device *dev, size_t index)
+{
+       return NULL;
+}
+static inline struct dma_chan *acpi_dma_request_slave_chan_by_name(
+               struct device *dev, const char *name)
+{
+       return NULL;
+}
+
+#define acpi_dma_simple_xlate  NULL
+
+#endif
+
+#define acpi_dma_request_slave_channel acpi_dma_request_slave_chan_by_index
+
+#endif /* __LINUX_ACPI_DMA_H */
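For context, a hypothetical DMA controller driver could hook into this new header from its probe routine roughly as follows; the driver name, the filter function body and the pairing with acpi_dma_simple_xlate() are assumptions, not part of this patch:

#include <linux/acpi_dma.h>
#include <linux/dmaengine.h>
#include <linux/platform_device.h>

static bool sample_dma_filter(struct dma_chan *chan, void *param)
{
	/* driver-specific matching of @chan against @param goes here */
	return true;
}

static int sample_dma_probe(struct platform_device *pdev)
{
	static struct acpi_dma_filter_info filter_info;
	int ret;

	dma_cap_zero(filter_info.dma_cap);
	dma_cap_set(DMA_SLAVE, filter_info.dma_cap);
	filter_info.filter_fn = sample_dma_filter;

	ret = devm_acpi_dma_controller_register(&pdev->dev,
						acpi_dma_simple_xlate,
						&filter_info);
	if (ret)
		dev_warn(&pdev->dev, "ACPI DMA registration failed: %d\n", ret);
	return 0;
}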
index 31ff6dba4872a96c6bbfa5a78bbe27a782affc41..1bdf965339f9bef4222a5bc739efc0b23807e35e 100644 (file)
@@ -9,91 +9,32 @@
 
 #include <linux/atomic.h>
 
-#define AIO_MAXSEGS            4
-#define AIO_KIOGRP_NR_ATOMIC   8
-
 struct kioctx;
+struct kiocb;
 
-/* Notes on cancelling a kiocb:
- *     If a kiocb is cancelled, aio_complete may return 0 to indicate 
- *     that cancel has not yet disposed of the kiocb.  All cancel 
- *     operations *must* call aio_put_req to dispose of the kiocb 
- *     to guard against races with the completion code.
- */
-#define KIOCB_C_CANCELLED      0x01
-#define KIOCB_C_COMPLETE       0x02
-
-#define KIOCB_SYNC_KEY         (~0U)
+#define KIOCB_KEY              0
 
-/* ki_flags bits */
 /*
- * This may be used for cancel/retry serialization in the future, but
- * for now it's unused and we probably don't want modules to even
- * think they can use it.
+ * We use ki_cancel == KIOCB_CANCELLED to indicate that a kiocb has been either
+ * cancelled or completed (this makes a certain amount of sense because
+ * successful cancellation - io_cancel() - does deliver the completion to
+ * userspace).
+ *
+ * And since most things don't implement kiocb cancellation and we'd really like
+ * kiocb completion to be lockless when possible, we use ki_cancel to
+ * synchronize cancellation and completion - we only set it to KIOCB_CANCELLED
+ * with xchg() or cmpxchg(), see batch_complete_aio() and kiocb_cancel().
  */
-/* #define KIF_LOCKED          0 */
-#define KIF_KICKED             1
-#define KIF_CANCELLED          2
-
-#define kiocbTryLock(iocb)     test_and_set_bit(KIF_LOCKED, &(iocb)->ki_flags)
-#define kiocbTryKick(iocb)     test_and_set_bit(KIF_KICKED, &(iocb)->ki_flags)
+#define KIOCB_CANCELLED                ((void *) (~0ULL))
 
-#define kiocbSetLocked(iocb)   set_bit(KIF_LOCKED, &(iocb)->ki_flags)
-#define kiocbSetKicked(iocb)   set_bit(KIF_KICKED, &(iocb)->ki_flags)
-#define kiocbSetCancelled(iocb)        set_bit(KIF_CANCELLED, &(iocb)->ki_flags)
+typedef int (kiocb_cancel_fn)(struct kiocb *, struct io_event *);
 
-#define kiocbClearLocked(iocb) clear_bit(KIF_LOCKED, &(iocb)->ki_flags)
-#define kiocbClearKicked(iocb) clear_bit(KIF_KICKED, &(iocb)->ki_flags)
-#define kiocbClearCancelled(iocb)      clear_bit(KIF_CANCELLED, &(iocb)->ki_flags)
-
-#define kiocbIsLocked(iocb)    test_bit(KIF_LOCKED, &(iocb)->ki_flags)
-#define kiocbIsKicked(iocb)    test_bit(KIF_KICKED, &(iocb)->ki_flags)
-#define kiocbIsCancelled(iocb) test_bit(KIF_CANCELLED, &(iocb)->ki_flags)
-
-/* is there a better place to document function pointer methods? */
-/**
- * ki_retry    -       iocb forward progress callback
- * @kiocb:     The kiocb struct to advance by performing an operation.
- *
- * This callback is called when the AIO core wants a given AIO operation
- * to make forward progress.  The kiocb argument describes the operation
- * that is to be performed.  As the operation proceeds, perhaps partially,
- * ki_retry is expected to update the kiocb with progress made.  Typically
- * ki_retry is set in the AIO core and it itself calls file_operations
- * helpers.
- *
- * ki_retry's return value determines when the AIO operation is completed
- * and an event is generated in the AIO event ring.  Except the special
- * return values described below, the value that is returned from ki_retry
- * is transferred directly into the completion ring as the operation's
- * resulting status.  Once this has happened ki_retry *MUST NOT* reference
- * the kiocb pointer again.
- *
- * If ki_retry returns -EIOCBQUEUED it has made a promise that aio_complete()
- * will be called on the kiocb pointer in the future.  The AIO core will
- * not ask the method again -- ki_retry must ensure forward progress.
- * aio_complete() must be called once and only once in the future, multiple
- * calls may result in undefined behaviour.
- *
- * If ki_retry returns -EIOCBRETRY it has made a promise that kick_iocb()
- * will be called on the kiocb pointer in the future.  This may happen
- * through generic helpers that associate kiocb->ki_wait with a wait
- * queue head that ki_retry uses via current->io_wait.  It can also happen
- * with custom tracking and manual calls to kick_iocb(), though that is
- * discouraged.  In either case, kick_iocb() must be called once and only
- * once.  ki_retry must ensure forward progress, the AIO core will wait
- * indefinitely for kick_iocb() to be called.
- */
 struct kiocb {
-       struct list_head        ki_run_list;
-       unsigned long           ki_flags;
-       int                     ki_users;
-       unsigned                ki_key;         /* id of this request */
+       atomic_t                ki_users;
 
        struct file             *ki_filp;
-       struct kioctx           *ki_ctx;        /* may be NULL for sync ops */
-       int                     (*ki_cancel)(struct kiocb *, struct io_event *);
-       ssize_t                 (*ki_retry)(struct kiocb *);
+       struct kioctx           *ki_ctx;        /* NULL for sync ops */
+       kiocb_cancel_fn         *ki_cancel;
        void                    (*ki_dtor)(struct kiocb *);
 
        union {
@@ -117,7 +58,6 @@ struct kiocb {
 
        struct list_head        ki_list;        /* the aio core uses this
                                                 * for cancellation */
-       struct list_head        ki_batch;       /* batch allocation */
 
        /*
         * If the aio_resfd field of the userspace iocb is not zero,
@@ -128,106 +68,40 @@ struct kiocb {
 
 static inline bool is_sync_kiocb(struct kiocb *kiocb)
 {
-       return kiocb->ki_key == KIOCB_SYNC_KEY;
+       return kiocb->ki_ctx == NULL;
 }
 
 static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp)
 {
        *kiocb = (struct kiocb) {
-                       .ki_users = 1,
-                       .ki_key = KIOCB_SYNC_KEY,
+                       .ki_users = ATOMIC_INIT(1),
+                       .ki_ctx = NULL,
                        .ki_filp = filp,
                        .ki_obj.tsk = current,
                };
 }
 
-#define AIO_RING_MAGIC                 0xa10a10a1
-#define AIO_RING_COMPAT_FEATURES       1
-#define AIO_RING_INCOMPAT_FEATURES     0
-struct aio_ring {
-       unsigned        id;     /* kernel internal index number */
-       unsigned        nr;     /* number of io_events */
-       unsigned        head;
-       unsigned        tail;
-
-       unsigned        magic;
-       unsigned        compat_features;
-       unsigned        incompat_features;
-       unsigned        header_length;  /* size of aio_ring */
-
-
-       struct io_event         io_events[0];
-}; /* 128 bytes + ring size */
-
-#define AIO_RING_PAGES 8
-struct aio_ring_info {
-       unsigned long           mmap_base;
-       unsigned long           mmap_size;
-
-       struct page             **ring_pages;
-       spinlock_t              ring_lock;
-       long                    nr_pages;
-
-       unsigned                nr, tail;
-
-       struct page             *internal_pages[AIO_RING_PAGES];
-};
-
-static inline unsigned aio_ring_avail(struct aio_ring_info *info,
-                                       struct aio_ring *ring)
-{
-       return (ring->head + info->nr - 1 - ring->tail) % info->nr;
-}
-
-struct kioctx {
-       atomic_t                users;
-       int                     dead;
-       struct mm_struct        *mm;
-
-       /* This needs improving */
-       unsigned long           user_id;
-       struct hlist_node       list;
-
-       wait_queue_head_t       wait;
-
-       spinlock_t              ctx_lock;
-
-       int                     reqs_active;
-       struct list_head        active_reqs;    /* used for cancellation */
-       struct list_head        run_list;       /* used for kicked reqs */
-
-       /* sys_io_setup currently limits this to an unsigned int */
-       unsigned                max_reqs;
-
-       struct aio_ring_info    ring_info;
-
-       struct delayed_work     wq;
-
-       struct rcu_head         rcu_head;
-};
-
 /* prototypes */
-extern unsigned aio_max_size;
-
 #ifdef CONFIG_AIO
 extern ssize_t wait_on_sync_kiocb(struct kiocb *iocb);
-extern int aio_put_req(struct kiocb *iocb);
-extern void kick_iocb(struct kiocb *iocb);
-extern int aio_complete(struct kiocb *iocb, long res, long res2);
+extern void aio_put_req(struct kiocb *iocb);
+extern void aio_complete(struct kiocb *iocb, long res, long res2);
 struct mm_struct;
 extern void exit_aio(struct mm_struct *mm);
 extern long do_io_submit(aio_context_t ctx_id, long nr,
                         struct iocb __user *__user *iocbpp, bool compat);
+void kiocb_set_cancel_fn(struct kiocb *req, kiocb_cancel_fn *cancel);
 #else
 static inline ssize_t wait_on_sync_kiocb(struct kiocb *iocb) { return 0; }
-static inline int aio_put_req(struct kiocb *iocb) { return 0; }
-static inline void kick_iocb(struct kiocb *iocb) { }
-static inline int aio_complete(struct kiocb *iocb, long res, long res2) { return 0; }
+static inline void aio_put_req(struct kiocb *iocb) { }
+static inline void aio_complete(struct kiocb *iocb, long res, long res2) { }
 struct mm_struct;
 static inline void exit_aio(struct mm_struct *mm) { }
 static inline long do_io_submit(aio_context_t ctx_id, long nr,
                                struct iocb __user * __user *iocbpp,
                                bool compat) { return 0; }
+static inline void kiocb_set_cancel_fn(struct kiocb *req,
+                                      kiocb_cancel_fn *cancel) { }
 #endif /* CONFIG_AIO */
 
 static inline struct kiocb *list_kiocb(struct list_head *h)
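The new kiocb_set_cancel_fn() hook lets a driver that keeps kiocbs in flight advertise a cancellation callback. A purely illustrative sketch; the request structure, the hardware helpers and the use of iocb->private are assumptions rather than anything defined by this patch:

#include <linux/aio.h>

struct sample_req;			/* driver-private in-flight request */
static int sample_hw_abort(struct sample_req *req);
static void sample_hw_queue(struct sample_req *req);

static int sample_aio_cancel(struct kiocb *iocb, struct io_event *event)
{
	struct sample_req *req = iocb->private;

	/* abort the in-flight operation; its normal completion path still
	 * reports the result */
	return sample_hw_abort(req);
}

static void sample_aio_submit(struct kiocb *iocb, struct sample_req *req)
{
	iocb->private = req;
	kiocb_set_cancel_fn(iocb, sample_aio_cancel);
	sample_hw_queue(req);
}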
index 350459910fe138a3d47573a07e8b125c62f36564..c3881553f7d15ef029323e49bcbc9e48365eb5db 100644 (file)
@@ -18,6 +18,7 @@
 #include <linux/writeback.h>
 #include <linux/atomic.h>
 #include <linux/sysctl.h>
+#include <linux/workqueue.h>
 
 struct page;
 struct device;
@@ -27,7 +28,6 @@ struct dentry;
  * Bits in backing_dev_info.state
  */
 enum bdi_state {
-       BDI_pending,            /* On its way to being activated */
        BDI_wb_alloc,           /* Default embedded wb allocated */
        BDI_async_congested,    /* The async (write) queue is getting full */
        BDI_sync_congested,     /* The sync queue is getting full */
@@ -53,10 +53,8 @@ struct bdi_writeback {
        unsigned int nr;
 
        unsigned long last_old_flush;   /* last old data flush */
-       unsigned long last_active;      /* last time bdi thread was active */
 
-       struct task_struct *task;       /* writeback thread */
-       struct timer_list wakeup_timer; /* used for delayed bdi thread wakeup */
+       struct delayed_work dwork;      /* work item used for writeback */
        struct list_head b_dirty;       /* dirty inodes */
        struct list_head b_io;          /* parked for writeback */
        struct list_head b_more_io;     /* parked for more writeback */
@@ -123,14 +121,15 @@ int bdi_setup_and_register(struct backing_dev_info *, char *, unsigned int);
 void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
                        enum wb_reason reason);
 void bdi_start_background_writeback(struct backing_dev_info *bdi);
-int bdi_writeback_thread(void *data);
+void bdi_writeback_workfn(struct work_struct *work);
 int bdi_has_dirty_io(struct backing_dev_info *bdi);
 void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi);
 void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2);
 
 extern spinlock_t bdi_lock;
 extern struct list_head bdi_list;
-extern struct list_head bdi_pending_list;
+
+extern struct workqueue_struct *bdi_wq;
 
 static inline int wb_has_dirty_io(struct bdi_writeback *wb)
 {
@@ -336,11 +335,6 @@ static inline bool bdi_cap_swap_backed(struct backing_dev_info *bdi)
        return bdi->capabilities & BDI_CAP_SWAP_BACKED;
 }
 
-static inline bool bdi_cap_flush_forker(struct backing_dev_info *bdi)
-{
-       return bdi == &default_backing_dev_info;
-}
-
 static inline bool mapping_cap_writeback_dirty(struct address_space *mapping)
 {
        return bdi_cap_writeback_dirty(mapping->backing_dev_info);
index 820e7aaad4fdbbf432b188b083662b5015bd3905..ef24466d8f82516a76029577df9c8f1ec530cf20 100644 (file)
@@ -67,6 +67,7 @@
 #define bio_offset(bio)                bio_iovec((bio))->bv_offset
 #define bio_segments(bio)      ((bio)->bi_vcnt - (bio)->bi_idx)
 #define bio_sectors(bio)       ((bio)->bi_size >> 9)
+#define bio_end_sector(bio)    ((bio)->bi_sector + bio_sectors((bio)))
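One plausible use of the new bio_end_sector() helper is a bounds check against a device capacity; a tiny illustrative sketch:

#include <linux/bio.h>

/* returns true when the bio fits entirely below @capacity sectors */
static bool sample_bio_in_range(struct bio *bio, sector_t capacity)
{
	return bio_end_sector(bio) <= capacity;
}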
 
 static inline unsigned int bio_cur_bytes(struct bio *bio)
 {
@@ -84,11 +85,6 @@ static inline void *bio_data(struct bio *bio)
        return NULL;
 }
 
-static inline int bio_has_allocated_vec(struct bio *bio)
-{
-       return bio->bi_io_vec && bio->bi_io_vec != bio->bi_inline_vecs;
-}
-
 /*
  * will die
  */
@@ -136,16 +132,27 @@ static inline int bio_has_allocated_vec(struct bio *bio)
 #define bio_io_error(bio) bio_endio((bio), -EIO)
 
 /*
- * drivers should not use the __ version unless they _really_ want to
- * run through the entire bio and not just pending pieces
+ * drivers should not use the __ version unless they _really_ know what
+ * they're doing
  */
 #define __bio_for_each_segment(bvl, bio, i, start_idx)                 \
        for (bvl = bio_iovec_idx((bio), (start_idx)), i = (start_idx);  \
             i < (bio)->bi_vcnt;                                        \
             bvl++, i++)
 
+/*
+ * drivers should _never_ use the all version - the bio may have been split
+ * before it got to the driver and the driver won't own all of it
+ */
+#define bio_for_each_segment_all(bvl, bio, i)                          \
+       for (i = 0;                                                     \
+            bvl = bio_iovec_idx((bio), (i)), i < (bio)->bi_vcnt;       \
+            i++)
+
 #define bio_for_each_segment(bvl, bio, i)                              \
-       __bio_for_each_segment(bvl, bio, i, (bio)->bi_idx)
+       for (i = (bio)->bi_idx;                                         \
+            bvl = bio_iovec_idx((bio), (i)), i < (bio)->bi_vcnt;       \
+            i++)
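A sketch of the intended split in usage, assuming a bio the caller allocated and still owns (so the _all variant is safe); a driver handed a possibly split bio would use bio_for_each_segment() instead:

#include <linux/bio.h>
#include <linux/highmem.h>
#include <linux/string.h>

/* zero every page of a bio that this code allocated itself */
static void sample_zero_bio_pages(struct bio *bio)
{
	struct bio_vec *bvec;
	int i;

	bio_for_each_segment_all(bvec, bio, i) {
		void *p = kmap_atomic(bvec->bv_page);

		memset(p + bvec->bv_offset, 0, bvec->bv_len);
		kunmap_atomic(p);
	}
}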
 
 /*
  * get a reference to a bio, so it won't disappear. the intended use is
@@ -180,9 +187,12 @@ struct bio_integrity_payload {
        unsigned short          bip_slab;       /* slab the bip came from */
        unsigned short          bip_vcnt;       /* # of integrity bio_vecs */
        unsigned short          bip_idx;        /* current bip_vec index */
+       unsigned                bip_owns_buf:1; /* should free bip_buf */
 
        struct work_struct      bip_work;       /* I/O completion */
-       struct bio_vec          bip_vec[0];     /* embedded bvec array */
+
+       struct bio_vec          *bip_vec;
+       struct bio_vec          bip_inline_vecs[0];/* embedded bvec array */
 };
 #endif /* CONFIG_BLK_DEV_INTEGRITY */
 
@@ -211,6 +221,7 @@ extern void bio_pair_release(struct bio_pair *dbio);
 
 extern struct bio_set *bioset_create(unsigned int, unsigned int);
 extern void bioset_free(struct bio_set *);
+extern mempool_t *biovec_create_pool(struct bio_set *bs, int pool_entries);
 
 extern struct bio *bio_alloc_bioset(gfp_t, int, struct bio_set *);
 extern void bio_put(struct bio *);
@@ -245,6 +256,9 @@ extern void bio_endio(struct bio *, int);
 struct request_queue;
 extern int bio_phys_segments(struct request_queue *, struct bio *);
 
+extern int submit_bio_wait(int rw, struct bio *bio);
+extern void bio_advance(struct bio *, unsigned);
+
 extern void bio_init(struct bio *);
 extern void bio_reset(struct bio *);
 
@@ -279,6 +293,9 @@ static inline void bio_flush_dcache_pages(struct bio *bi)
 }
 #endif
 
+extern void bio_copy_data(struct bio *dst, struct bio *src);
+extern int bio_alloc_pages(struct bio *bio, gfp_t gfp);
+
 extern struct bio *bio_copy_user(struct request_queue *, struct rq_map_data *,
                                 unsigned long, unsigned int, int, gfp_t);
 extern struct bio *bio_copy_user_iov(struct request_queue *,
@@ -286,8 +303,8 @@ extern struct bio *bio_copy_user_iov(struct request_queue *,
                                     int, int, gfp_t);
 extern int bio_uncopy_user(struct bio *);
 void zero_fill_bio(struct bio *bio);
-extern struct bio_vec *bvec_alloc_bs(gfp_t, int, unsigned long *, struct bio_set *);
-extern void bvec_free_bs(struct bio_set *, struct bio_vec *, unsigned int);
+extern struct bio_vec *bvec_alloc(gfp_t, int, unsigned long *, mempool_t *);
+extern void bvec_free(mempool_t *, struct bio_vec *, unsigned int);
 extern unsigned int bvec_nr_vecs(unsigned short idx);
 
 #ifdef CONFIG_BLK_CGROUP
@@ -298,39 +315,6 @@ static inline int bio_associate_current(struct bio *bio) { return -ENOENT; }
 static inline void bio_disassociate_task(struct bio *bio) { }
 #endif /* CONFIG_BLK_CGROUP */
 
-/*
- * bio_set is used to allow other portions of the IO system to
- * allocate their own private memory pools for bio and iovec structures.
- * These memory pools in turn all allocate from the bio_slab
- * and the bvec_slabs[].
- */
-#define BIO_POOL_SIZE 2
-#define BIOVEC_NR_POOLS 6
-#define BIOVEC_MAX_IDX (BIOVEC_NR_POOLS - 1)
-
-struct bio_set {
-       struct kmem_cache *bio_slab;
-       unsigned int front_pad;
-
-       mempool_t *bio_pool;
-#if defined(CONFIG_BLK_DEV_INTEGRITY)
-       mempool_t *bio_integrity_pool;
-#endif
-       mempool_t *bvec_pool;
-};
-
-struct biovec_slab {
-       int nr_vecs;
-       char *name;
-       struct kmem_cache *slab;
-};
-
-/*
- * a small number of entries is fine, not going to be performance critical.
- * basically we just need to survive
- */
-#define BIO_SPLIT_ENTRIES 2
-
 #ifdef CONFIG_HIGHMEM
 /*
  * remember never ever reenable interrupts between a bvec_kmap_irq and
@@ -527,6 +511,49 @@ static inline struct bio *bio_list_get(struct bio_list *bl)
        return bio;
 }
 
+/*
+ * bio_set is used to allow other portions of the IO system to
+ * allocate their own private memory pools for bio and iovec structures.
+ * These memory pools in turn all allocate from the bio_slab
+ * and the bvec_slabs[].
+ */
+#define BIO_POOL_SIZE 2
+#define BIOVEC_NR_POOLS 6
+#define BIOVEC_MAX_IDX (BIOVEC_NR_POOLS - 1)
+
+struct bio_set {
+       struct kmem_cache *bio_slab;
+       unsigned int front_pad;
+
+       mempool_t *bio_pool;
+       mempool_t *bvec_pool;
+#if defined(CONFIG_BLK_DEV_INTEGRITY)
+       mempool_t *bio_integrity_pool;
+       mempool_t *bvec_integrity_pool;
+#endif
+
+       /*
+        * Deadlock avoidance for stacking block drivers: see comments in
+        * bio_alloc_bioset() for details
+        */
+       spinlock_t              rescue_lock;
+       struct bio_list         rescue_list;
+       struct work_struct      rescue_work;
+       struct workqueue_struct *rescue_workqueue;
+};
+
+struct biovec_slab {
+       int nr_vecs;
+       char *name;
+       struct kmem_cache *slab;
+};
+
+/*
+ * a small number of entries is fine, not going to be performance critical.
+ * basically we just need to survive
+ */
+#define BIO_SPLIT_ENTRIES 2
+
 #if defined(CONFIG_BLK_DEV_INTEGRITY)
 
 #define bip_vec_idx(bip, idx)  (&(bip->bip_vec[(idx)]))
index 22990cf4439d2e8b91fedc0be9d254985f7ab64a..fa1abeb45b7602a4f0c1a4098f05f63d7a075281 100644 (file)
@@ -118,6 +118,7 @@ struct bio {
  * BIO_POOL_IDX()
  */
 #define BIO_RESET_BITS 13
+#define BIO_OWNS_VEC   13      /* bio_free() should free bvec */
 
 #define bio_flagged(bio, flag) ((bio)->bi_flags & (1 << (flag)))
 
@@ -176,6 +177,7 @@ enum rq_flag_bits {
        __REQ_IO_STAT,          /* account I/O stat */
        __REQ_MIXED_MERGE,      /* merge of different types, fail separately */
        __REQ_KERNEL,           /* direct IO to kernel pages */
+       __REQ_PM,               /* runtime pm request */
        __REQ_NR_BITS,          /* stops here */
 };
 
@@ -198,6 +200,8 @@ enum rq_flag_bits {
         REQ_SECURE)
 #define REQ_CLONE_MASK         REQ_COMMON_MASK
 
+#define BIO_NO_ADVANCE_ITER_MASK       (REQ_DISCARD|REQ_WRITE_SAME)
+
 /* This mask is used for both bio and request merge checking */
 #define REQ_NOMERGE_FLAGS \
        (REQ_NOMERGE | REQ_STARTED | REQ_SOFTBARRIER | REQ_FLUSH | REQ_FUA)
@@ -224,5 +228,6 @@ enum rq_flag_bits {
 #define REQ_MIXED_MERGE                (1 << __REQ_MIXED_MERGE)
 #define REQ_SECURE             (1 << __REQ_SECURE)
 #define REQ_KERNEL             (1 << __REQ_KERNEL)
+#define REQ_PM                 (1 << __REQ_PM)
 
 #endif /* __LINUX_BLK_TYPES_H */
index 78feda9bbae2632b2c0e61493e31b909af55f522..2fdb4a451b49bd626d9415b231c76b7ac927cf69 100644 (file)
@@ -361,6 +361,12 @@ struct request_queue {
         */
        struct kobject kobj;
 
+#ifdef CONFIG_PM_RUNTIME
+       struct device           *dev;
+       int                     rpm_status;
+       unsigned int            nr_pending;
+#endif
+
        /*
         * queue settings
         */
@@ -838,7 +844,7 @@ static inline unsigned int blk_queue_get_max_sectors(struct request_queue *q,
                                                     unsigned int cmd_flags)
 {
        if (unlikely(cmd_flags & REQ_DISCARD))
-               return q->limits.max_discard_sectors;
+               return min(q->limits.max_discard_sectors, UINT_MAX >> 9);
 
        if (unlikely(cmd_flags & REQ_WRITE_SAME))
                return q->limits.max_write_same_sectors;
@@ -960,6 +966,27 @@ struct request_queue *blk_alloc_queue(gfp_t);
 struct request_queue *blk_alloc_queue_node(gfp_t, int);
 extern void blk_put_queue(struct request_queue *);
 
+/*
+ * block layer runtime pm functions
+ */
+#ifdef CONFIG_PM_RUNTIME
+extern void blk_pm_runtime_init(struct request_queue *q, struct device *dev);
+extern int blk_pre_runtime_suspend(struct request_queue *q);
+extern void blk_post_runtime_suspend(struct request_queue *q, int err);
+extern void blk_pre_runtime_resume(struct request_queue *q);
+extern void blk_post_runtime_resume(struct request_queue *q, int err);
+#else
+static inline void blk_pm_runtime_init(struct request_queue *q,
+       struct device *dev) {}
+static inline int blk_pre_runtime_suspend(struct request_queue *q)
+{
+       return -ENOSYS;
+}
+static inline void blk_post_runtime_suspend(struct request_queue *q, int err) {}
+static inline void blk_pre_runtime_resume(struct request_queue *q) {}
+static inline void blk_post_runtime_resume(struct request_queue *q, int err) {}
+#endif
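A sketch of how a block driver might wire these helpers into its runtime PM callbacks, assuming the queue is reachable through the device's drvdata; the surrounding driver structure and the extra pm_runtime_* calls are illustrative:

#include <linux/blkdev.h>
#include <linux/pm_runtime.h>

static void sample_blk_setup_pm(struct request_queue *q, struct device *dev)
{
	blk_pm_runtime_init(q, dev);
	pm_runtime_set_active(dev);
	pm_runtime_enable(dev);
}

static int sample_runtime_suspend(struct device *dev)
{
	struct request_queue *q = dev_get_drvdata(dev);
	int err;

	err = blk_pre_runtime_suspend(q);	/* fails if requests are pending */
	if (err)
		return err;
	err = 0;				/* put the hardware to sleep here */
	blk_post_runtime_suspend(q, err);
	return err;
}

static int sample_runtime_resume(struct device *dev)
{
	struct request_queue *q = dev_get_drvdata(dev);

	blk_pre_runtime_resume(q);
	/* wake the hardware here */
	blk_post_runtime_resume(q, 0);
	return 0;
}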
+
 /*
  * blk_plug permits building a queue of related requests by holding the I/O
  * fragments for a short period. This allows merging of sequential requests
@@ -1484,7 +1511,7 @@ static inline bool blk_integrity_is_initialized(struct gendisk *g)
 
 struct block_device_operations {
        int (*open) (struct block_device *, fmode_t);
-       int (*release) (struct gendisk *, fmode_t);
+       void (*release) (struct gendisk *, fmode_t);
        int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
        int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
        int (*direct_access) (struct block_device *, sector_t,
index 3bff9ce09cf7b92fcd04fb06c19864b0a07df91b..5047355b9a0fcf4e2b07076b414c255d2c9b7510 100644 (file)
@@ -28,6 +28,7 @@ struct cgroup_subsys;
 struct inode;
 struct cgroup;
 struct css_id;
+struct eventfd_ctx;
 
 extern int cgroup_init_early(void);
 extern int cgroup_init(void);
index f204a7a9cf382139a10c77302b4f5aecf1c118be..6e7ec64b69ab4b7aa5ee7aacef569f633f0de296 100644 (file)
@@ -78,3 +78,9 @@ SUBSYS(hugetlb)
 #endif
 
 /* */
+
+#ifdef CONFIG_CGROUP_BCACHE
+SUBSYS(bcache)
+#endif
+
+/* */
index 40b4ef54cc7d50661553545724951e4e47b4c4e8..282e27028418adc27dcee1f8b7831cd36c4a80ec 100644 (file)
 #define __CPU_COOLING_H__
 
 #include <linux/thermal.h>
+#include <linux/cpumask.h>
 
-#define CPUFREQ_COOLING_START          0
-#define CPUFREQ_COOLING_STOP           1
-
-#if defined(CONFIG_CPU_THERMAL) || defined(CONFIG_CPU_THERMAL_MODULE)
+#ifdef CONFIG_CPU_THERMAL
 /**
  * cpufreq_cooling_register - function to create cpufreq cooling device.
  * @clip_cpus: cpumask of cpus where the frequency constraints will happen
  */
-struct thermal_cooling_device *cpufreq_cooling_register(
-               const struct cpumask *clip_cpus);
+struct thermal_cooling_device *
+cpufreq_cooling_register(const struct cpumask *clip_cpus);
 
 /**
  * cpufreq_cooling_unregister - function to remove cpufreq cooling device.
  * @cdev: thermal cooling device pointer.
  */
 void cpufreq_cooling_unregister(struct thermal_cooling_device *cdev);
+
+unsigned long cpufreq_cooling_get_level(unsigned int, unsigned int);
 #else /* !CONFIG_CPU_THERMAL */
-static inline struct thermal_cooling_device *cpufreq_cooling_register(
-       const struct cpumask *clip_cpus)
+static inline struct thermal_cooling_device *
+cpufreq_cooling_register(const struct cpumask *clip_cpus)
 {
        return NULL;
 }
-static inline void cpufreq_cooling_unregister(
-               struct thermal_cooling_device *cdev)
+static inline
+void cpufreq_cooling_unregister(struct thermal_cooling_device *cdev)
 {
        return;
 }
+static inline
+unsigned long cpufreq_cooling_get_level(unsigned int, unsigned int)
+{
+       return THERMAL_CSTATE_INVALID;
+}
 #endif /* CONFIG_CPU_THERMAL */
 
 #endif /* __CPU_COOLING_H__ */
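A hypothetical thermal driver could register and release a cpufreq cooling device for the online CPUs along these lines; the names and error handling are assumptions:

#include <linux/cpu_cooling.h>
#include <linux/cpumask.h>
#include <linux/err.h>

static struct thermal_cooling_device *sample_cdev;

static int sample_thermal_register_cooling(void)
{
	sample_cdev = cpufreq_cooling_register(cpu_online_mask);
	if (IS_ERR_OR_NULL(sample_cdev))
		return sample_cdev ? PTR_ERR(sample_cdev) : -ENODEV;
	return 0;
}

static void sample_thermal_unregister_cooling(void)
{
	cpufreq_cooling_unregister(sample_cdev);
}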
index 91ac8da2502017a0463f73e2394c2778bd78376d..96d3e4ab11a91a4ea28d0f864d1cfbee6e249127 100644 (file)
@@ -967,8 +967,9 @@ enum dma_status dma_sync_wait(struct dma_chan *chan, dma_cookie_t cookie);
 #ifdef CONFIG_DMA_ENGINE
 enum dma_status dma_wait_for_async_tx(struct dma_async_tx_descriptor *tx);
 void dma_issue_pending_all(void);
-struct dma_chan *__dma_request_channel(dma_cap_mask_t *mask, dma_filter_fn fn, void *fn_param);
-struct dma_chan *dma_request_slave_channel(struct device *dev, char *name);
+struct dma_chan *__dma_request_channel(const dma_cap_mask_t *mask,
+                                       dma_filter_fn fn, void *fn_param);
+struct dma_chan *dma_request_slave_channel(struct device *dev, const char *name);
 void dma_release_channel(struct dma_chan *chan);
 #else
 static inline enum dma_status dma_wait_for_async_tx(struct dma_async_tx_descriptor *tx)
@@ -978,13 +979,13 @@ static inline enum dma_status dma_wait_for_async_tx(struct dma_async_tx_descript
 static inline void dma_issue_pending_all(void)
 {
 }
-static inline struct dma_chan *__dma_request_channel(dma_cap_mask_t *mask,
+static inline struct dma_chan *__dma_request_channel(const dma_cap_mask_t *mask,
                                              dma_filter_fn fn, void *fn_param)
 {
        return NULL;
 }
 static inline struct dma_chan *dma_request_slave_channel(struct device *dev,
-                                                        char *name)
+                                                        const char *name)
 {
        return NULL;
 }
@@ -1005,9 +1006,9 @@ struct dma_chan *net_dma_find_channel(void);
        __dma_request_slave_channel_compat(&(mask), x, y, dev, name)
 
 static inline struct dma_chan
-*__dma_request_slave_channel_compat(dma_cap_mask_t *mask, dma_filter_fn fn,
-                                 void *fn_param, struct device *dev,
-                                 char *name)
+*__dma_request_slave_channel_compat(const dma_cap_mask_t *mask,
+                                 dma_filter_fn fn, void *fn_param,
+                                 struct device *dev, char *name)
 {
        struct dma_chan *chan;
 
index 0c5a18ec322ce2b1ef963d8364d868adc3a82f98..1b4d4ee1168ffb3b8ac90c42aaad073753573b1f 100644 (file)
@@ -52,7 +52,7 @@
 #endif
 
 extern const char *drbd_buildtag(void);
-#define REL_VERSION "8.4.2"
+#define REL_VERSION "8.4.3"
 #define API_VERSION 1
 #define PRO_VERSION_MIN 86
 #define PRO_VERSION_MAX 101
@@ -319,7 +319,8 @@ enum drbd_state_rv {
        SS_IN_TRANSIENT_STATE = -18,  /* Retry after the next state change */
        SS_CONCURRENT_ST_CHG = -19,   /* Concurrent cluster side state change! */
        SS_O_VOL_PEER_PRI = -20,
-       SS_AFTER_LAST_ERROR = -21,    /* Keep this at bottom */
+       SS_OUTDATE_WO_CONN = -21,
+       SS_AFTER_LAST_ERROR = -22,    /* Keep this at bottom */
 };
 
 /* from drbd_strings.c */
index 1fa19c5f5e64a0f03d8ffb98234983ae79b68e67..1fedf2b17cc801805581a185638fe97184be27b3 100644 (file)
 #define DRBD_RESYNC_RATE_DEF 250
 #define DRBD_RESYNC_RATE_SCALE 'k'  /* kilobytes */
 
-  /* less than 7 would hit performance unnecessarily.
-   * 919 slots context information per transaction,
-   * 32k activity log, 4k transaction size,
-   * one transaction in flight:
-   * 919 * 7 = 6433 */
+  /* less than 7 would hit performance unnecessarily. */
 #define DRBD_AL_EXTENTS_MIN  7
-#define DRBD_AL_EXTENTS_MAX  6433
+  /* we use u16 as "slot number", (u16)~0 is "FREE".
+   * If you use >= 292 kB on-disk ring buffer,
+   * this is the maximum you can use: */
+#define DRBD_AL_EXTENTS_MAX  0xfffe
 #define DRBD_AL_EXTENTS_DEF  1237
 #define DRBD_AL_EXTENTS_SCALE '1'
 
index f6bf082d4d4f3afdf4fedcd606c1d15d7120b84c..89627b9187f945b960a7e3612aad170bbe01b13e 100644 (file)
@@ -28,6 +28,5 @@
 #define EBADTYPE       527     /* Type not supported by server */
 #define EJUKEBOX       528     /* Request initiated, but will not complete before timeout */
 #define EIOCBQUEUED    529     /* iocb queued, will get completion event */
-#define EIOCBRETRY     530     /* iocb queued, will trigger a retry */
 
 #endif
index f9a12f6243a59dd8c8d3ba8467eabdc5bd866f3f..df6fab82f87e7650bafaa247523713fc3cc4be2f 100644 (file)
@@ -139,7 +139,7 @@ struct f2fs_extent {
        __le32 len;             /* length of the extent */
 } __packed;
 
-#define F2FS_MAX_NAME_LEN      256
+#define F2FS_NAME_LEN          255
 #define ADDRS_PER_INODE         923    /* Address Pointers in an Inode */
 #define ADDRS_PER_BLOCK         1018   /* Address Pointers in a Direct Block */
 #define NIDS_PER_BLOCK          1018   /* Node IDs in an Indirect Block */
@@ -165,7 +165,8 @@ struct f2fs_inode {
        __le32 i_flags;                 /* file attributes */
        __le32 i_pino;                  /* parent inode number */
        __le32 i_namelen;               /* file name length */
-       __u8 i_name[F2FS_MAX_NAME_LEN]; /* file name for SPOR */
+       __u8 i_name[F2FS_NAME_LEN];     /* file name for SPOR */
+       __u8 i_reserved2;               /* for backward compatibility */
 
        struct f2fs_extent i_ext;       /* caching a largest extent */
 
@@ -362,10 +363,10 @@ struct f2fs_summary_block {
 typedef __le32 f2fs_hash_t;
 
 /* One directory entry slot covers 8bytes-long file name */
-#define F2FS_NAME_LEN          8
-#define F2FS_NAME_LEN_BITS     3
+#define F2FS_SLOT_LEN          8
+#define F2FS_SLOT_LEN_BITS     3
 
-#define GET_DENTRY_SLOTS(x)    ((x + F2FS_NAME_LEN - 1) >> F2FS_NAME_LEN_BITS)
+#define GET_DENTRY_SLOTS(x)    ((x + F2FS_SLOT_LEN - 1) >> F2FS_SLOT_LEN_BITS)
 
 /* the number of dentry in a block */
 #define NR_DENTRY_IN_BLOCK     214
@@ -377,10 +378,10 @@ typedef __le32    f2fs_hash_t;
 #define SIZE_OF_DENTRY_BITMAP  ((NR_DENTRY_IN_BLOCK + BITS_PER_BYTE - 1) / \
                                        BITS_PER_BYTE)
 #define SIZE_OF_RESERVED       (PAGE_SIZE - ((SIZE_OF_DIR_ENTRY + \
-                               F2FS_NAME_LEN) * \
+                               F2FS_SLOT_LEN) * \
                                NR_DENTRY_IN_BLOCK + SIZE_OF_DENTRY_BITMAP))
 
-/* One directory entry slot representing F2FS_NAME_LEN-sized file name */
+/* One directory entry slot representing F2FS_SLOT_LEN-sized file name */
 struct f2fs_dir_entry {
        __le32 hash_code;       /* hash code of file name */
        __le32 ino;             /* inode number */
@@ -394,7 +395,7 @@ struct f2fs_dentry_block {
        __u8 dentry_bitmap[SIZE_OF_DENTRY_BITMAP];
        __u8 reserved[SIZE_OF_RESERVED];
        struct f2fs_dir_entry dentry[NR_DENTRY_IN_BLOCK];
-       __u8 filename[NR_DENTRY_IN_BLOCK][F2FS_NAME_LEN];
+       __u8 filename[NR_DENTRY_IN_BLOCK][F2FS_SLOT_LEN];
 } __packed;
 
 /* file types used in inode_info->flags */
index b5a24ba83b6f17c7ae98bce197559a8d7e6ef60e..43db02e9c9fa11bed1b058fba9dd9039ea9c226d 100644 (file)
@@ -2091,7 +2091,7 @@ extern struct block_device *blkdev_get_by_path(const char *path, fmode_t mode,
                                               void *holder);
 extern struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode,
                                              void *holder);
-extern int blkdev_put(struct block_device *bdev, fmode_t mode);
+extern void blkdev_put(struct block_device *bdev, fmode_t mode);
 #ifdef CONFIG_SYSFS
 extern int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk);
 extern void bd_unlink_disk_holder(struct block_device *bdev,
index f6c7ae3e223b53914eac6ae4941a977d03597636..552e3f46e4a33f5b4915c86182c13cf0c876233a 100644 (file)
@@ -39,7 +39,7 @@ struct gpio {
        const char      *label;
 };
 
-#ifdef CONFIG_GENERIC_GPIO
+#ifdef CONFIG_GPIOLIB
 
 #ifdef CONFIG_ARCH_HAVE_CUSTOM_GPIO_H
 #include <asm/gpio.h>
@@ -74,7 +74,7 @@ static inline int irq_to_gpio(unsigned int irq)
 
 #endif /* ! CONFIG_ARCH_HAVE_CUSTOM_GPIO_H */
 
-#else /* ! CONFIG_GENERIC_GPIO */
+#else /* ! CONFIG_GPIOLIB */
 
 #include <linux/kernel.h>
 #include <linux/types.h>
@@ -226,7 +226,7 @@ gpiochip_remove_pin_ranges(struct gpio_chip *chip)
        WARN_ON(1);
 }
 
-#endif /* ! CONFIG_GENERIC_GPIO */
+#endif /* ! CONFIG_GPIOLIB */
 
 struct device;
 
index 3a62df310f2effdd71317684589c9ff92570195e..6b4890fa57e7191574da1efe41eec956077d72ae 100644 (file)
@@ -189,8 +189,7 @@ static inline struct hugetlbfs_sb_info *HUGETLBFS_SB(struct super_block *sb)
 
 extern const struct file_operations hugetlbfs_file_operations;
 extern const struct vm_operations_struct hugetlb_vm_ops;
-struct file *hugetlb_file_setup(const char *name, unsigned long addr,
-                               size_t size, vm_flags_t acct,
+struct file *hugetlb_file_setup(const char *name, size_t size, vm_flags_t acct,
                                struct user_struct **user, int creat_flags,
                                int page_size_log);
 
@@ -209,8 +208,8 @@ static inline int is_file_hugepages(struct file *file)
 
 #define is_file_hugepages(file)                        0
 static inline struct file *
-hugetlb_file_setup(const char *name, unsigned long addr, size_t size,
-               vm_flags_t acctflag, struct user_struct **user, int creat_flags,
+hugetlb_file_setup(const char *name, size_t size, vm_flags_t acctflag,
+               struct user_struct **user, int creat_flags,
                int page_size_log)
 {
        return ERR_PTR(-ENOSYS);
@@ -288,6 +287,13 @@ static inline struct hstate *hstate_file(struct file *f)
        return hstate_inode(file_inode(f));
 }
 
+static inline struct hstate *hstate_sizelog(int page_size_log)
+{
+       if (!page_size_log)
+               return &default_hstate;
+       return size_to_hstate(1 << page_size_log);
+}
+
 static inline struct hstate *hstate_vma(struct vm_area_struct *vma)
 {
        return hstate_file(vma->vm_file);
@@ -352,11 +358,12 @@ static inline int hstate_index(struct hstate *h)
        return h - hstates;
 }
 
-#else
+#else  /* CONFIG_HUGETLB_PAGE */
 struct hstate {};
 #define alloc_huge_page_node(h, nid) NULL
 #define alloc_bootmem_huge_page(h) NULL
 #define hstate_file(f) NULL
+#define hstate_sizelog(s) NULL
 #define hstate_vma(v) NULL
 #define hstate_inode(i) NULL
 #define huge_page_size(h) PAGE_SIZE
@@ -371,6 +378,6 @@ static inline unsigned int pages_per_huge_page(struct hstate *h)
 }
 #define hstate_index_to_shift(index) 0
 #define hstate_index(h) 0
-#endif
+#endif /* CONFIG_HUGETLB_PAGE */
 
 #endif /* _LINUX_HUGETLB_H */
index a470ac3ef49d426abd82f26e62ca6e2b951f67f3..871a213a8477eb5f1d7c6ad3318f6b796bac7106 100644 (file)
@@ -124,11 +124,13 @@ static inline void *idr_find(struct idr *idr, int id)
  * @idp:     idr handle
  * @entry:   the type * to use as cursor
  * @id:      id entry's key
+ *
+ * @entry and @id do not need to be initialized before the loop, and
+ * after normal termination @entry is left with the value NULL.  This
+ * is convenient for a "not found" value.
  */
-#define idr_for_each_entry(idp, entry, id)                             \
-       for (id = 0, entry = (typeof(entry))idr_get_next((idp), &(id)); \
-            entry != NULL;                                             \
-            ++id, entry = (typeof(entry))idr_get_next((idp), &(id)))
+#define idr_for_each_entry(idp, entry, id)                     \
+       for (id = 0; ((entry) = idr_get_next(idp, &(id))) != NULL; ++id)
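A usage sketch relying on the documented post-loop NULL value; the object type and lookup key are illustrative:

#include <linux/idr.h>
#include <linux/string.h>

struct sample_obj {
	const char *name;
};

static struct sample_obj *sample_find_by_name(struct idr *idr, const char *name)
{
	struct sample_obj *obj;
	int id;

	idr_for_each_entry(idr, obj, id)
		if (!strcmp(obj->name, name))
			break;

	return obj;	/* NULL when the loop ran to completion */
}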
 
 /*
  * Don't use the following functions.  These exist only to suppress
index 4972e6e9ca93e5e4f2388b70bfe2c6de661af246..e15828fd71f1b589780b933549e3b07c9652c1e9 100644 (file)
@@ -39,8 +39,11 @@ static inline void kref_init(struct kref *kref)
  */
 static inline void kref_get(struct kref *kref)
 {
-       WARN_ON(!atomic_read(&kref->refcount));
-       atomic_inc(&kref->refcount);
+       /* If the refcount was 0 before the increment, then we have raced with
+        * another thread that is freeing this kref right now.  In that case
+        * kref_get_unless_zero() should be used instead.
+        */
+       WARN_ON_ONCE(atomic_inc_return(&kref->refcount) < 2);
 }
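A sketch of the lookup pattern the comment points at, where kref_get_unless_zero() skips an object that is already being torn down; the list and lock are assumptions:

#include <linux/kref.h>
#include <linux/list.h>
#include <linux/spinlock.h>

struct sample_item {
	struct list_head node;
	struct kref kref;
	int id;
};

static struct sample_item *sample_lookup(struct list_head *head,
					 spinlock_t *lock, int id)
{
	struct sample_item *item, *found = NULL;

	spin_lock(lock);
	list_for_each_entry(item, head, node) {
		if (item->id == id && kref_get_unless_zero(&item->kref)) {
			found = item;
			break;
		}
	}
	spin_unlock(lock);
	return found;
}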
 
 /**
@@ -100,7 +103,7 @@ static inline int kref_put_mutex(struct kref *kref,
                                 struct mutex *lock)
 {
        WARN_ON(release == NULL);
-        if (unlikely(!atomic_add_unless(&kref->refcount, -1, 1))) {
+       if (unlikely(!atomic_add_unless(&kref->refcount, -1, 1))) {
                mutex_lock(lock);
                if (unlikely(!atomic_dec_and_test(&kref->refcount))) {
                        mutex_unlock(lock);
index 4019013c6593b90adebe0760b344e57a76afaa50..46262284de478196e4e907674de9e5e04f364d60 100644 (file)
@@ -256,6 +256,7 @@ extern void lc_destroy(struct lru_cache *lc);
 extern void lc_set(struct lru_cache *lc, unsigned int enr, int index);
 extern void lc_del(struct lru_cache *lc, struct lc_element *element);
 
+extern struct lc_element *lc_get_cumulative(struct lru_cache *lc, unsigned int enr);
 extern struct lc_element *lc_try_get(struct lru_cache *lc, unsigned int enr);
 extern struct lc_element *lc_find(struct lru_cache *lc, unsigned int enr);
 extern struct lc_element *lc_get(struct lru_cache *lc, unsigned int enr);
index 53acaf64189f7bce45b238d51458ddad9a03db0b..a51b0134ce18c6bc4c15e3ac0fb898243d51bb86 100644 (file)
@@ -903,11 +903,12 @@ static inline int map_hw_to_sw_id(u16 header_id)
 }
 
 enum mlx4_net_trans_promisc_mode {
-       MLX4_FS_PROMISC_NONE = 0,
-       MLX4_FS_PROMISC_UPLINK,
-       /* For future use. Not implemented yet */
-       MLX4_FS_PROMISC_FUNCTION_PORT,
-       MLX4_FS_PROMISC_ALL_MULTI,
+       MLX4_FS_REGULAR = 1,
+       MLX4_FS_ALL_DEFAULT,
+       MLX4_FS_MC_DEFAULT,
+       MLX4_FS_UC_SNIFFER,
+       MLX4_FS_MC_SNIFFER,
+       MLX4_FS_MODE_NUM, /* should be last */
 };
 
 struct mlx4_spec_eth {
@@ -936,7 +937,7 @@ struct mlx4_spec_ipv4 {
 };
 
 struct mlx4_spec_ib {
-       __be32  r_qpn;
+       __be32  l3_qpn;
        __be32  qpn_msk;
        u8      dst_gid[16];
        u8      dst_gid_msk[16];
@@ -969,6 +970,92 @@ struct mlx4_net_trans_rule {
        u32     qpn;
 };
 
+struct mlx4_net_trans_rule_hw_ctrl {
+       __be16 prio;
+       u8 type;
+       u8 flags;
+       u8 rsvd1;
+       u8 funcid;
+       u8 vep;
+       u8 port;
+       __be32 qpn;
+       __be32 rsvd2;
+};
+
+struct mlx4_net_trans_rule_hw_ib {
+       u8 size;
+       u8 rsvd1;
+       __be16 id;
+       u32 rsvd2;
+       __be32 l3_qpn;
+       __be32 qpn_mask;
+       u8 dst_gid[16];
+       u8 dst_gid_msk[16];
+} __packed;
+
+struct mlx4_net_trans_rule_hw_eth {
+       u8      size;
+       u8      rsvd;
+       __be16  id;
+       u8      rsvd1[6];
+       u8      dst_mac[6];
+       u16     rsvd2;
+       u8      dst_mac_msk[6];
+       u16     rsvd3;
+       u8      src_mac[6];
+       u16     rsvd4;
+       u8      src_mac_msk[6];
+       u8      rsvd5;
+       u8      ether_type_enable;
+       __be16  ether_type;
+       __be16  vlan_tag_msk;
+       __be16  vlan_tag;
+} __packed;
+
+struct mlx4_net_trans_rule_hw_tcp_udp {
+       u8      size;
+       u8      rsvd;
+       __be16  id;
+       __be16  rsvd1[3];
+       __be16  dst_port;
+       __be16  rsvd2;
+       __be16  dst_port_msk;
+       __be16  rsvd3;
+       __be16  src_port;
+       __be16  rsvd4;
+       __be16  src_port_msk;
+} __packed;
+
+struct mlx4_net_trans_rule_hw_ipv4 {
+       u8      size;
+       u8      rsvd;
+       __be16  id;
+       __be32  rsvd1;
+       __be32  dst_ip;
+       __be32  dst_ip_msk;
+       __be32  src_ip;
+       __be32  src_ip_msk;
+} __packed;
+
+struct _rule_hw {
+       union {
+               struct {
+                       u8 size;
+                       u8 rsvd;
+                       __be16 id;
+               };
+               struct mlx4_net_trans_rule_hw_eth eth;
+               struct mlx4_net_trans_rule_hw_ib ib;
+               struct mlx4_net_trans_rule_hw_ipv4 ipv4;
+               struct mlx4_net_trans_rule_hw_tcp_udp tcp_udp;
+       };
+};
+
+/* translating DMFS verbs sniffer rule to the FW API would need two reg IDs */
+struct mlx4_flow_handle {
+       u64 reg_id[2];
+};
+
 int mlx4_flow_steer_promisc_add(struct mlx4_dev *dev, u8 port, u32 qpn,
                                enum mlx4_net_trans_promisc_mode mode);
 int mlx4_flow_steer_promisc_remove(struct mlx4_dev *dev, u8 port,
@@ -1018,6 +1105,11 @@ void mlx4_counter_free(struct mlx4_dev *dev, u32 idx);
 int mlx4_flow_attach(struct mlx4_dev *dev,
                     struct mlx4_net_trans_rule *rule, u64 *reg_id);
 int mlx4_flow_detach(struct mlx4_dev *dev, u64 reg_id);
+int mlx4_map_sw_to_hw_steering_mode(struct mlx4_dev *dev,
+                                   enum mlx4_net_trans_promisc_mode flow_type);
+int mlx4_map_sw_to_hw_steering_id(struct mlx4_dev *dev,
+                                 enum mlx4_net_trans_rule_id id);
+int mlx4_hw_rule_sz(struct mlx4_dev *dev, enum mlx4_net_trans_rule_id id);
 
 void mlx4_sync_pkey_table(struct mlx4_dev *dev, int slave, int port,
                          int i, int val);
index 799a0697a3835a751c4a63e02f82cf871d27c342..192e0f7784f2868e647eeb7b49abc15e7bab4639 100644 (file)
@@ -39,4 +39,6 @@ struct mlx4_wqe_srq_next_seg {
        u32                     reserved2[3];
 };
 
+struct mlx4_srq *mlx4_srq_lookup(struct mlx4_dev *dev, u32 srqn);
+
 #endif /* MLX4_SRQ_H */
index 1a7f19e7f1a0f2c3bd798fb032cd16af9978585b..e0c8528a41a4d4a278fe736a46341755d46c5479 100644 (file)
@@ -951,13 +951,19 @@ void unmap_vmas(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
  * (see walk_page_range for more details)
  */
 struct mm_walk {
-       int (*pgd_entry)(pgd_t *, unsigned long, unsigned long, struct mm_walk *);
-       int (*pud_entry)(pud_t *, unsigned long, unsigned long, struct mm_walk *);
-       int (*pmd_entry)(pmd_t *, unsigned long, unsigned long, struct mm_walk *);
-       int (*pte_entry)(pte_t *, unsigned long, unsigned long, struct mm_walk *);
-       int (*pte_hole)(unsigned long, unsigned long, struct mm_walk *);
-       int (*hugetlb_entry)(pte_t *, unsigned long,
-                            unsigned long, unsigned long, struct mm_walk *);
+       int (*pgd_entry)(pgd_t *pgd, unsigned long addr,
+                        unsigned long next, struct mm_walk *walk);
+       int (*pud_entry)(pud_t *pud, unsigned long addr,
+                        unsigned long next, struct mm_walk *walk);
+       int (*pmd_entry)(pmd_t *pmd, unsigned long addr,
+                        unsigned long next, struct mm_walk *walk);
+       int (*pte_entry)(pte_t *pte, unsigned long addr,
+                        unsigned long next, struct mm_walk *walk);
+       int (*pte_hole)(unsigned long addr, unsigned long next,
+                       struct mm_walk *walk);
+       int (*hugetlb_entry)(pte_t *pte, unsigned long hmask,
+                            unsigned long addr, unsigned long next,
+                            struct mm_walk *walk);
        struct mm_struct *mm;
        void *private;
 };
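A sketch of a caller filling in one of these callbacks and running walk_page_range() over part of an mm; the counting example and locking are illustrative:

#include <linux/mm.h>
#include <linux/sched.h>

static int sample_count_pte(pte_t *pte, unsigned long addr,
			    unsigned long next, struct mm_walk *walk)
{
	unsigned long *count = walk->private;

	if (pte_present(*pte))
		(*count)++;
	return 0;
}

static unsigned long sample_count_present(struct mm_struct *mm,
					  unsigned long start, unsigned long end)
{
	unsigned long count = 0;
	struct mm_walk walk = {
		.pte_entry	= sample_count_pte,
		.mm		= mm,
		.private	= &count,
	};

	down_read(&mm->mmap_sem);
	walk_page_range(start, end, &walk);
	up_read(&mm->mmap_sem);
	return count;
}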
index 4eb0a50d0c55583454d1842d039dacdb648b7b5e..e93837f647dea52fe359318899f6e7893c78bc20 100644 (file)
@@ -74,7 +74,7 @@ struct mtd_blktrans_ops {
 
        /* Called with mtd_table_mutex held; no race with add/remove */
        int (*open)(struct mtd_blktrans_dev *dev);
-       int (*release)(struct mtd_blktrans_dev *dev);
+       void (*release)(struct mtd_blktrans_dev *dev);
 
        /* Called on {de,}registration and on subsequent addition/removal
           of devices, with mtd_table_mutex held. */
index f9ac2897b86b040f0143b9bbc3d10bb9699f6cd6..a5cf4e8d68187e5bd5e7892effc0957dc5bfea0b 100644 (file)
@@ -362,10 +362,10 @@ struct mtd_partition;
 struct mtd_part_parser_data;
 
 extern int mtd_device_parse_register(struct mtd_info *mtd,
-                             const char **part_probe_types,
-                             struct mtd_part_parser_data *parser_data,
-                             const struct mtd_partition *defparts,
-                             int defnr_parts);
+                                    const char * const *part_probe_types,
+                                    struct mtd_part_parser_data *parser_data,
+                                    const struct mtd_partition *defparts,
+                                    int defnr_parts);
 #define mtd_device_register(master, parts, nr_parts)   \
        mtd_device_parse_register(master, NULL, NULL, parts, nr_parts)
 extern int mtd_device_unregister(struct mtd_info *master);
index ef52d9c91459e0204e31c98483922b7ab95b1ebb..ab6363443ce81f033b641b014102a2dfdf8921be 100644 (file)
@@ -86,7 +86,6 @@ extern int nand_unlock(struct mtd_info *mtd, loff_t ofs, uint64_t len);
 #define NAND_CMD_READOOB       0x50
 #define NAND_CMD_ERASE1                0x60
 #define NAND_CMD_STATUS                0x70
-#define NAND_CMD_STATUS_MULTI  0x71
 #define NAND_CMD_SEQIN         0x80
 #define NAND_CMD_RNDIN         0x85
 #define NAND_CMD_READID                0x90
@@ -105,25 +104,6 @@ extern int nand_unlock(struct mtd_info *mtd, loff_t ofs, uint64_t len);
 #define NAND_CMD_RNDOUTSTART   0xE0
 #define NAND_CMD_CACHEDPROG    0x15
 
-/* Extended commands for AG-AND device */
-/*
- * Note: the command for NAND_CMD_DEPLETE1 is really 0x00 but
- *       there is no way to distinguish that from NAND_CMD_READ0
- *       until the remaining sequence of commands has been completed
- *       so add a high order bit and mask it off in the command.
- */
-#define NAND_CMD_DEPLETE1      0x100
-#define NAND_CMD_DEPLETE2      0x38
-#define NAND_CMD_STATUS_MULTI  0x71
-#define NAND_CMD_STATUS_ERROR  0x72
-/* multi-bank error status (banks 0-3) */
-#define NAND_CMD_STATUS_ERROR0 0x73
-#define NAND_CMD_STATUS_ERROR1 0x74
-#define NAND_CMD_STATUS_ERROR2 0x75
-#define NAND_CMD_STATUS_ERROR3 0x76
-#define NAND_CMD_STATUS_RESET  0x7f
-#define NAND_CMD_STATUS_CLEAR  0xff
-
 #define NAND_CMD_NONE          -1
 
 /* Status bits */
@@ -165,28 +145,8 @@ typedef enum {
  */
 /* Buswidth is 16 bit */
 #define NAND_BUSWIDTH_16       0x00000002
-/* Device supports partial programming without padding */
-#define NAND_NO_PADDING                0x00000004
 /* Chip has cache program function */
 #define NAND_CACHEPRG          0x00000008
-/* Chip has copy back function */
-#define NAND_COPYBACK          0x00000010
-/*
- * AND Chip which has 4 banks and a confusing page / block
- * assignment. See Renesas datasheet for further information.
- */
-#define NAND_IS_AND            0x00000020
-/*
- * Chip has a array of 4 pages which can be read without
- * additional ready /busy waits.
- */
-#define NAND_4PAGE_ARRAY       0x00000040
-/*
- * Chip requires that BBT is periodically rewritten to prevent
- * bits from adjacent blocks from 'leaking' in altering data.
- * This happens with the Renesas AG-AND chips, possibly others.
- */
-#define BBT_AUTO_REFRESH       0x00000080
 /*
  * Chip requires ready check on read (for auto-incremented sequential read).
  * True only for small page devices; large page devices do not support
@@ -207,13 +167,10 @@ typedef enum {
 #define NAND_SUBPAGE_READ      0x00001000
 
 /* Options valid for Samsung large page devices */
-#define NAND_SAMSUNG_LP_OPTIONS \
-       (NAND_NO_PADDING | NAND_CACHEPRG | NAND_COPYBACK)
+#define NAND_SAMSUNG_LP_OPTIONS NAND_CACHEPRG
 
 /* Macros to identify the above */
-#define NAND_MUST_PAD(chip) (!(chip->options & NAND_NO_PADDING))
 #define NAND_HAS_CACHEPROG(chip) ((chip->options & NAND_CACHEPRG))
-#define NAND_HAS_COPYBACK(chip) ((chip->options & NAND_COPYBACK))
 #define NAND_HAS_SUBPAGE_READ(chip) ((chip->options & NAND_SUBPAGE_READ))
 
 /* Non chip related options */
@@ -361,6 +318,7 @@ struct nand_hw_control {
  *             any single ECC step, 0 if bitflips uncorrectable, -EIO hw error
  * @read_subpage:      function to read parts of the page covered by ECC;
  *                     returns same as read_page()
+ * @write_subpage:     function to write parts of the page covered by ECC.
  * @write_page:        function to write a page according to the ECC generator
  *             requirements.
  * @write_oob_raw:     function to write chip OOB data without ECC
@@ -392,6 +350,9 @@ struct nand_ecc_ctrl {
                        uint8_t *buf, int oob_required, int page);
        int (*read_subpage)(struct mtd_info *mtd, struct nand_chip *chip,
                        uint32_t offs, uint32_t len, uint8_t *buf);
+       int (*write_subpage)(struct mtd_info *mtd, struct nand_chip *chip,
+                       uint32_t offset, uint32_t data_len,
+                       const uint8_t *data_buf, int oob_required);
        int (*write_page)(struct mtd_info *mtd, struct nand_chip *chip,
                        const uint8_t *buf, int oob_required);
        int (*write_oob_raw)(struct mtd_info *mtd, struct nand_chip *chip,
@@ -527,8 +488,8 @@ struct nand_chip {
        int (*errstat)(struct mtd_info *mtd, struct nand_chip *this, int state,
                        int status, int page);
        int (*write_page)(struct mtd_info *mtd, struct nand_chip *chip,
-                       const uint8_t *buf, int oob_required, int page,
-                       int cached, int raw);
+                       uint32_t offset, int data_len, const uint8_t *buf,
+                       int oob_required, int page, int cached, int raw);
        int (*onfi_set_features)(struct mtd_info *mtd, struct nand_chip *chip,
                        int feature_addr, uint8_t *subfeature_para);
        int (*onfi_get_features)(struct mtd_info *mtd, struct nand_chip *chip,
@@ -589,25 +550,65 @@ struct nand_chip {
 #define NAND_MFR_MACRONIX      0xc2
 #define NAND_MFR_EON           0x92
 
+/* The maximum expected count of bytes in the NAND ID sequence */
+#define NAND_MAX_ID_LEN 8
+
+/*
+ * A helper for defining older NAND chips where the second ID byte fully
+ * defined the chip, including the geometry (chip size, eraseblock size, page
+ * size). All these chips have 512 bytes NAND page size.
+ */
+#define LEGACY_ID_NAND(nm, devid, chipsz, erasesz, opts)          \
+       { .name = (nm), {{ .dev_id = (devid) }}, .pagesize = 512, \
+         .chipsize = (chipsz), .erasesize = (erasesz), .options = (opts) }
+
+/*
+ * A helper for defining newer chips which report their page size and
+ * eraseblock size via the extended ID bytes.
+ *
+ * The real difference between LEGACY_ID_NAND and EXTENDED_ID_NAND is that with
+ * EXTENDED_ID_NAND, manufacturers overloaded the same device ID so that the
+ * device ID now only represented a particular total chip size (and voltage,
+ * buswidth), and the page size, eraseblock size, and OOB size could vary while
+ * using the same device ID.
+ */
+#define EXTENDED_ID_NAND(nm, devid, chipsz, opts)                      \
+       { .name = (nm), {{ .dev_id = (devid) }}, .chipsize = (chipsz), \
+         .options = (opts) }
+
 /**
  * struct nand_flash_dev - NAND Flash Device ID Structure
- * @name:      Identify the device type
- * @id:                device ID code
- * @pagesize:  Pagesize in bytes. Either 256 or 512 or 0
- *             If the pagesize is 0, then the real pagesize
- *             and the eraseize are determined from the
- *             extended id bytes in the chip
- * @erasesize: Size of an erase block in the flash device.
- * @chipsize:  Total chipsize in Mega Bytes
- * @options:   Bitfield to store chip relevant options
+ * @name: a human-readable name of the NAND chip
+ * @dev_id: the device ID (the second byte of the full chip ID array)
+ * @mfr_id: manufacturer ID part of the full chip ID array (refers to the same
+ *          memory address as @id[0])
+ * @dev_id: device ID part of the full chip ID array (refers to the same
+ *          memory address as @id[1])
+ * @id: full device ID array
+ * @pagesize: size of the NAND page in bytes; if 0, then the real page size (as
+ *            well as the eraseblock size) is determined from the extended NAND
+ *            chip ID array
+ * @chipsize: total chip size in MiB
+ * @erasesize: eraseblock size in bytes (determined from the extended ID if 0)
+ * @options: stores various chip bit options
+ * @id_len: The valid length of the @id.
+ * @oobsize: OOB size
  */
 struct nand_flash_dev {
        char *name;
-       int id;
-       unsigned long pagesize;
-       unsigned long chipsize;
-       unsigned long erasesize;
-       unsigned long options;
+       union {
+               struct {
+                       uint8_t mfr_id;
+                       uint8_t dev_id;
+               };
+               uint8_t id[NAND_MAX_ID_LEN];
+       };
+       unsigned int pagesize;
+       unsigned int chipsize;
+       unsigned int erasesize;
+       unsigned int options;
+       uint16_t id_len;
+       uint16_t oobsize;
 };
 
 /**
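
For orientation, a minimal sketch (not part of the patch) of how the two helper
macros above could populate a flash ID table; the chip names, device IDs and
sizes below are illustrative, not entries from the real nand_ids.c table:

	#include <linux/mtd/nand.h>

	static struct nand_flash_dev example_nand_ids[] = {
		/* old-style chip: geometry fully implied by the device ID */
		LEGACY_ID_NAND("NAND 64MiB 3,3V 8-bit", 0x76, 64, 0x4000, 0),
		/* new-style chip: page/eraseblock/OOB size read from the extended ID */
		EXTENDED_ID_NAND("NAND 128MiB 3,3V 8-bit", 0xf1, 128, 0),
		{NULL}
	};
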
index d2887e76b7f66845b3662c9c52b80b211b78f38b..aa6a2633c2da13697a0a969ab320973756f155b3 100644 (file)
@@ -30,7 +30,7 @@ struct physmap_flash_data {
        unsigned int            pfow_base;
        char                    *probe_type;
        struct mtd_partition    *parts;
-       const char              **part_probe_types;
+       const char * const      *part_probe_types;
 };
 
 #endif /* __LINUX_MTD_PHYSMAP__ */
index e07890aff1cf9523cb0fe4d07b0c06353da046c9..44212d65aa9760d3da44aec47fc7139e021a704d 100644 (file)
@@ -20,8 +20,8 @@
 
 struct platdata_mtd_ram {
        const char              *mapname;
-       const char              **map_probes;
-       const char              **probes;
+       const char * const      *map_probes;
+       const char * const      *probes;
        struct mtd_partition    *partitions;
        int                      nr_partitions;
        int                      bankwidth;
index 766c5bc9d441ccf440341dab53e06616e3262363..104b62f23ee025a51d730228e061ea49ce4513e0 100644 (file)
@@ -1176,7 +1176,7 @@ struct nfs41_test_stateid_res {
 
 struct nfs41_free_stateid_args {
        struct nfs4_sequence_args       seq_args;
-       nfs4_stateid                    *stateid;
+       nfs4_stateid                    stateid;
 };
 
 struct nfs41_free_stateid_res {
index d15073e080dd7130458c971c26d0a462746046a6..364dda734877d2b9a1d045d8082de2542420a475 100644 (file)
@@ -25,7 +25,6 @@ struct of_dma {
        struct dma_chan         *(*of_dma_xlate)
                                (struct of_phandle_args *, struct of_dma *);
        void                    *of_dma_data;
-       int                     use_count;
 };
 
 struct of_dma_filter_info {
@@ -38,9 +37,9 @@ extern int of_dma_controller_register(struct device_node *np,
                struct dma_chan *(*of_dma_xlate)
                (struct of_phandle_args *, struct of_dma *),
                void *data);
-extern int of_dma_controller_free(struct device_node *np);
+extern void of_dma_controller_free(struct device_node *np);
 extern struct dma_chan *of_dma_request_slave_channel(struct device_node *np,
-                                                    char *name);
+                                                    const char *name);
 extern struct dma_chan *of_dma_simple_xlate(struct of_phandle_args *dma_spec,
                struct of_dma *ofdma);
 #else
@@ -52,13 +51,12 @@ static inline int of_dma_controller_register(struct device_node *np,
        return -ENODEV;
 }
 
-static inline int of_dma_controller_free(struct device_node *np)
+static inline void of_dma_controller_free(struct device_node *np)
 {
-       return -ENODEV;
 }
 
 static inline struct dma_chan *of_dma_request_slave_channel(struct device_node *np,
-                                                    char *name)
+                                                    const char *name)
 {
        return NULL;
 }
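
A hedged sketch of a DMA controller driver using the updated prototypes above;
the foo_* driver and symbol names are invented, only the of_dma_* calls come
from this header:

	#include <linux/of_dma.h>
	#include <linux/platform_device.h>

	/* .dma_cap and .filter_fn would be filled in during probe */
	static struct of_dma_filter_info foo_dma_info;

	static int foo_dma_probe(struct platform_device *pdev)
	{
		/* hand phandle translation to the generic helper */
		return of_dma_controller_register(pdev->dev.of_node,
						  of_dma_simple_xlate,
						  &foo_dma_info);
	}

	static int foo_dma_remove(struct platform_device *pdev)
	{
		/* returns void now, so there is nothing to check on teardown */
		of_dma_controller_free(pdev->dev.of_node);
		return 0;
	}
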
index 731e4ecee3bd59186b780253bb3bde660593a2f5..e2772666f004ef2d83b3493fb6f93b2625fb12f4 100644 (file)
@@ -4,6 +4,7 @@
 #include <linux/sched.h>
 #include <linux/bug.h>
 #include <linux/mm.h>
+#include <linux/workqueue.h>
 #include <linux/threads.h>
 #include <linux/nsproxy.h>
 #include <linux/kref.h>
index 1bd5244d1dcd0720d878de32059867f125ab803e..bf0a83b7ed9d3e0538cc09c57f7d498065931501 100644 (file)
@@ -50,5 +50,5 @@ struct elm_errorvec {
 
 void elm_decode_bch_error_page(struct device *dev, u8 *ecc_calc,
                struct elm_errorvec *err_vec);
-void elm_config(struct device *dev, enum bch_ecc bch_type);
+int elm_config(struct device *dev, enum bch_ecc bch_type);
 #endif /* __ELM_H */
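
Since elm_config() now returns an error code instead of void, callers are
expected to check it. A hedged sketch; the wrapper below is invented and
BCH8_ECC is assumed to be one of the enum bch_ecc values:

	#include <linux/platform_data/elm.h>

	static int foo_setup_elm(struct device *elm_dev)
	{
		int ret;

		/* elm_config() can now fail; propagate the error to the caller */
		ret = elm_config(elm_dev, BCH8_ECC);
		if (ret < 0)
			return ret;
		return 0;
	}
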
diff --git a/include/linux/platform_data/imx-iram.h b/include/linux/platform_data/imx-iram.h
deleted file mode 100644 (file)
index 022690c..0000000
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
- * Copyright (C) 2010 Freescale Semiconductor, Inc. All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version 2
- * of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
- * MA 02110-1301, USA.
- */
-#include <linux/errno.h>
-
-#ifdef CONFIG_IRAM_ALLOC
-
-int __init iram_init(unsigned long base, unsigned long size);
-void __iomem *iram_alloc(unsigned int size, unsigned long *dma_addr);
-void iram_free(unsigned long dma_addr, unsigned int size);
-
-#else
-
-static inline int __init iram_init(unsigned long base, unsigned long size)
-{
-       return -ENOMEM;
-}
-
-static inline void __iomem *iram_alloc(unsigned int size, unsigned long *dma_addr)
-{
-       return NULL;
-}
-
-static inline void iram_free(unsigned long base, unsigned long size) {}
-
-#endif
index 347ce553a3065fd5020eb1ab864434db488df27c..3b9377d6b7a5fd63b13d02fc238d7da99fbef026 100644 (file)
@@ -29,13 +29,6 @@ u32 prandom_u32(void);
 void prandom_bytes(void *buf, int nbytes);
 void prandom_seed(u32 seed);
 
-/*
- * These macros are preserved for backward compatibility and should be
- * removed as soon as a transition is finished.
- */
-#define random32() prandom_u32()
-#define srandom32(seed) prandom_seed(seed)
-
 u32 prandom_u32_state(struct rnd_state *);
 void prandom_bytes_state(struct rnd_state *state, void *buf, int nbytes);
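
With the random32()/srandom32() compatibility macros gone, callers use the
prandom_* API directly. A small illustrative sketch; the helper name is made
up:

	#include <linux/random.h>

	/* old: r = random32();     new: r = prandom_u32()   */
	/* old: srandom32(seed);    new: prandom_seed(seed)  */
	static u32 foo_pick_jitter(void)
	{
		/* pseudo-random and not cryptographically secure */
		return prandom_u32() % 100;
	}
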
 
index faf33324c78f9613526a66474670303ae5661aab..9e7e745dac55a64ce513d2e4807bc045aa9a8b14 100644 (file)
@@ -401,6 +401,9 @@ enum rproc_crash_type {
  * @crash_comp: completion used to sync crash handler and the rproc reload
  * @recovery_disabled: flag that states if recovery was disabled
  * @max_notifyid: largest allocated notify id.
+ * @table_ptr: pointer to the resource table in effect
+ * @cached_table: copy of the resource table
+ * @table_csum: checksum of the resource table
  */
 struct rproc {
        struct klist_node node;
@@ -429,9 +432,13 @@ struct rproc {
        struct completion crash_comp;
        bool recovery_disabled;
        int max_notifyid;
+       struct resource_table *table_ptr;
+       struct resource_table *cached_table;
+       u32 table_csum;
 };
 
 /* we currently support only two vrings per rvdev */
+
 #define RVDEV_NUM_VRINGS 2
 
 /**
@@ -462,16 +469,14 @@ struct rproc_vring {
  * @rproc: the rproc handle
  * @vdev: the virtio device
  * @vring: the vrings for this vdev
- * @dfeatures: virtio device features
- * @gfeatures: virtio guest features
+ * @rsc_offset: offset of the vdev's resource entry
  */
 struct rproc_vdev {
        struct list_head node;
        struct rproc *rproc;
        struct virtio_device vdev;
        struct rproc_vring vring[RVDEV_NUM_VRINGS];
-       unsigned long dfeatures;
-       unsigned long gfeatures;
+       u32 rsc_offset;
 };
 
 struct rproc *rproc_alloc(struct device *dev, const char *name,
index 8da67d625e13fc888413847da4e2a7ae560b3d86..0616ffe45702f0c28fd20390ec9b8574c543032b 100644 (file)
@@ -133,10 +133,20 @@ do {                                                              \
        _down_write_nest_lock(sem, &(nest_lock)->dep_map);      \
 } while (0);
 
+/*
+ * Take/release a lock when the task that acquired it will not be the
+ * one releasing it.
+ *
+ * [ This API should be avoided as much as possible - the
+ *   proper abstraction for this case is completions. ]
+ */
+extern void down_read_non_owner(struct rw_semaphore *sem);
+extern void up_read_non_owner(struct rw_semaphore *sem);
 #else
 # define down_read_nested(sem, subclass)               down_read(sem)
 # define down_write_nest_lock(sem, nest_lock)  down_write(sem)
 # define down_write_nested(sem, subclass)      down_write(sem)
+# define down_read_non_owner(sem)              down_read(sem)
+# define up_read_non_owner(sem)                        up_read(sem)
 #endif
 
 #endif /* _LINUX_RWSEM_H */
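
A hedged sketch of the (discouraged) pattern the new non-owner helpers exist
for: the semaphore is taken in the submitting task and released from a
completion path running in another context. The foo_* names are invented:

	#include <linux/rwsem.h>

	static DECLARE_RWSEM(foo_in_flight);

	/* called from the submitting task */
	static void foo_submit_start(void)
	{
		down_read_non_owner(&foo_in_flight);
		/* hand the request to hardware here */
	}

	/* called from the completion path, typically another task or irq context */
	static void foo_submit_done(void)
	{
		up_read_non_owner(&foo_in_flight);
	}
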
index 4800e9d1864c301bfe1650d97350bd7b7c8a2e2f..caa8f4d0186b742c3effc99c296ac99724f2f4d1 100644 (file)
@@ -313,8 +313,6 @@ extern void schedule_preempt_disabled(void);
 struct nsproxy;
 struct user_namespace;
 
-#include <linux/aio.h>
-
 #ifdef CONFIG_MMU
 extern void arch_pick_mmap_layout(struct mm_struct *mm);
 extern unsigned long
@@ -1413,6 +1411,10 @@ struct task_struct {
 #ifdef CONFIG_UPROBES
        struct uprobe_task *utask;
 #endif
+#if defined(CONFIG_BCACHE) || defined(CONFIG_BCACHE_MODULE)
+       unsigned int    sequential_io;
+       unsigned int    sequential_io_avg;
+#endif
 };
 
 /* Future-safe accessor for struct task_struct's cpus_allowed. */
diff --git a/include/linux/sudmac.h b/include/linux/sudmac.h
new file mode 100644 (file)
index 0000000..377b8a5
--- /dev/null
@@ -0,0 +1,52 @@
+/*
+ * Header for the SUDMAC driver
+ *
+ * Copyright (C) 2013 Renesas Solutions Corp.
+ *
+ * This is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ */
+#ifndef SUDMAC_H
+#define SUDMAC_H
+
+#include <linux/dmaengine.h>
+#include <linux/shdma-base.h>
+#include <linux/types.h>
+
+/* Used by slave DMA clients to request DMA to/from a specific peripheral */
+struct sudmac_slave {
+       struct shdma_slave      shdma_slave;    /* Set by the platform */
+};
+
+/*
+ * Supplied by platforms to specify how a DMA channel has to be configured for
+ * a certain peripheral
+ */
+struct sudmac_slave_config {
+       int             slave_id;
+};
+
+struct sudmac_channel {
+       unsigned long   offset;
+       unsigned long   config;
+       unsigned long   wait;           /* The configurable range is 0 to 3 */
+       unsigned long   dint_end_bit;
+};
+
+struct sudmac_pdata {
+       const struct sudmac_slave_config *slave;
+       int slave_num;
+       const struct sudmac_channel *channel;
+       int channel_num;
+};
+
+/* Definitions for the sudmac_channel.config */
+#define SUDMAC_TX_BUFFER_MODE  BIT(0)
+#define SUDMAC_RX_END_MODE     BIT(1)
+
+/* Definitions for the sudmac_channel.dint_end_bit */
+#define SUDMAC_DMA_BIT_CH0     BIT(0)
+#define SUDMAC_DMA_BIT_CH1     BIT(1)
+
+#endif
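
A hedged sketch of platform data wiring for the new SUDMAC header; the slave
ID, register offset and channel layout below are invented for illustration:

	#include <linux/kernel.h>
	#include <linux/sudmac.h>

	static const struct sudmac_slave_config foo_sudmac_slaves[] = {
		{ .slave_id = 0 },
	};

	static const struct sudmac_channel foo_sudmac_channels[] = {
		{
			.offset		= 0x0000,
			.config		= SUDMAC_TX_BUFFER_MODE,
			.wait		= 3,	/* configurable range is 0 to 3 */
			.dint_end_bit	= SUDMAC_DMA_BIT_CH0,
		},
	};

	static struct sudmac_pdata foo_sudmac_pdata = {
		.slave		= foo_sudmac_slaves,
		.slave_num	= ARRAY_SIZE(foo_sudmac_slaves),
		.channel	= foo_sudmac_channels,
		.channel_num	= ARRAY_SIZE(foo_sudmac_channels),
	};
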
index e3c0ae9bb1faf876afca191701e481ecc30a4f1b..a386a1cbb6e1c912667ef7433ea608a003832940 100644 (file)
 #define THERMAL_MAX_TRIPS      12
 #define THERMAL_NAME_LENGTH    20
 
+/* invalid cooling state */
+#define THERMAL_CSTATE_INVALID -1UL
+
 /* No upper/lower limit requirement */
-#define THERMAL_NO_LIMIT       -1UL
+#define THERMAL_NO_LIMIT       THERMAL_CSTATE_INVALID
 
 /* Unit conversion macros */
 #define KELVIN_TO_CELSIUS(t)   (long)(((long)t-2732 >= 0) ?    \
@@ -184,7 +187,6 @@ struct thermal_governor {
        char name[THERMAL_NAME_LENGTH];
        int (*throttle)(struct thermal_zone_device *tz, int trip);
        struct list_head        governor_list;
-       struct module           *owner;
 };
 
 /* Structure that holds binding parameters for a zone */
@@ -237,21 +239,20 @@ void thermal_zone_device_update(struct thermal_zone_device *);
 struct thermal_cooling_device *thermal_cooling_device_register(char *, void *,
                const struct thermal_cooling_device_ops *);
 void thermal_cooling_device_unregister(struct thermal_cooling_device *);
+struct thermal_zone_device *thermal_zone_get_zone_by_name(const char *name);
+int thermal_zone_get_temp(struct thermal_zone_device *tz, unsigned long *temp);
 
 int get_tz_trend(struct thermal_zone_device *, int);
 struct thermal_instance *get_thermal_instance(struct thermal_zone_device *,
                struct thermal_cooling_device *, int);
 void thermal_cdev_update(struct thermal_cooling_device *);
-void notify_thermal_framework(struct thermal_zone_device *, int);
-
-int thermal_register_governor(struct thermal_governor *);
-void thermal_unregister_governor(struct thermal_governor *);
+void thermal_notify_framework(struct thermal_zone_device *, int);
 
 #ifdef CONFIG_NET
 extern int thermal_generate_netlink_event(struct thermal_zone_device *tz,
                                                enum events event);
 #else
-static int thermal_generate_netlink_event(struct thermal_zone_device *tz,
+static inline int thermal_generate_netlink_event(struct thermal_zone_device *tz,
                                                enum events event)
 {
        return 0;
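
A hedged sketch using the two new getters declared above; the zone name
"cpu-thermal" and the wrapper are illustrative, and actual zone names are
platform specific:

	#include <linux/err.h>
	#include <linux/thermal.h>

	static int foo_read_cpu_temp(unsigned long *temp)
	{
		struct thermal_zone_device *tz;

		tz = thermal_zone_get_zone_by_name("cpu-thermal");
		if (IS_ERR(tz))
			return PTR_ERR(tz);

		/* fills *temp (typically millidegrees Celsius) on success */
		return thermal_zone_get_temp(tz, temp);
	}
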
index da46327fca173044b186943f8c7a044378d186cd..f18d64129f99982f966f3c2fe733cd85260dc625 100644 (file)
@@ -56,6 +56,8 @@ struct usbnet {
        struct sk_buff_head     done;
        struct sk_buff_head     rxq_pause;
        struct urb              *interrupt;
+       unsigned                interrupt_count;
+       struct mutex            interrupt_mutex;
        struct usb_anchor       deferred;
        struct tasklet_struct   bh;
 
@@ -248,4 +250,7 @@ extern int usbnet_nway_reset(struct net_device *net);
 extern int usbnet_manage_power(struct usbnet *, int);
 extern void usbnet_link_change(struct usbnet *, bool, bool);
 
+extern int usbnet_status_start(struct usbnet *dev, gfp_t mem_flags);
+extern void usbnet_status_stop(struct usbnet *dev);
+
 #endif /* __LINUX_USB_USBNET_H */
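
A hedged sketch of how a usbnet minidriver might bracket a command with the
new status helpers, pausing the shared interrupt URB and resubmitting it
afterwards; the foo_* function is invented and the exact reference-counting
semantics are an assumption here:

	#include <linux/usb/usbnet.h>

	static int foo_do_command(struct usbnet *dev)
	{
		/* pause interrupt status polling while the command is in flight */
		usbnet_status_stop(dev);

		/* issue the control transfer here */

		/* resume polling; may sleep, hence GFP_KERNEL */
		return usbnet_status_start(dev, GFP_KERNEL);
	}
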
index 7cb64d4b499d21263feba5f3abf09bd600c5f8dc..ac38be2692d89f2993381e57d6145ab75891b1d3 100644 (file)
@@ -330,6 +330,92 @@ do {                                                                       \
        __ret;                                                          \
 })
 
+#define __wait_event_hrtimeout(wq, condition, timeout, state)          \
+({                                                                     \
+       int __ret = 0;                                                  \
+       DEFINE_WAIT(__wait);                                            \
+       struct hrtimer_sleeper __t;                                     \
+                                                                       \
+       hrtimer_init_on_stack(&__t.timer, CLOCK_MONOTONIC,              \
+                             HRTIMER_MODE_REL);                        \
+       hrtimer_init_sleeper(&__t, current);                            \
+       if ((timeout).tv64 != KTIME_MAX)                                \
+               hrtimer_start_range_ns(&__t.timer, timeout,             \
+                                      current->timer_slack_ns,         \
+                                      HRTIMER_MODE_REL);               \
+                                                                       \
+       for (;;) {                                                      \
+               prepare_to_wait(&wq, &__wait, state);                   \
+               if (condition)                                          \
+                       break;                                          \
+               if (state == TASK_INTERRUPTIBLE &&                      \
+                   signal_pending(current)) {                          \
+                       __ret = -ERESTARTSYS;                           \
+                       break;                                          \
+               }                                                       \
+               if (!__t.task) {                                        \
+                       __ret = -ETIME;                                 \
+                       break;                                          \
+               }                                                       \
+               schedule();                                             \
+       }                                                               \
+                                                                       \
+       hrtimer_cancel(&__t.timer);                                     \
+       destroy_hrtimer_on_stack(&__t.timer);                           \
+       finish_wait(&wq, &__wait);                                      \
+       __ret;                                                          \
+})
+
+/**
+ * wait_event_hrtimeout - sleep until a condition gets true or a timeout elapses
+ * @wq: the waitqueue to wait on
+ * @condition: a C expression for the event to wait for
+ * @timeout: timeout, as a ktime_t
+ *
+ * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the
+ * @condition evaluates to true or the timeout elapses.
+ * The @condition is checked each time the waitqueue @wq is woken up.
+ *
+ * wake_up() has to be called after changing any variable that could
+ * change the result of the wait condition.
+ *
+ * The function returns 0 if @condition became true, or -ETIME if the timeout
+ * elapsed.
+ */
+#define wait_event_hrtimeout(wq, condition, timeout)                   \
+({                                                                     \
+       int __ret = 0;                                                  \
+       if (!(condition))                                               \
+               __ret = __wait_event_hrtimeout(wq, condition, timeout,  \
+                                              TASK_UNINTERRUPTIBLE);   \
+       __ret;                                                          \
+})
+
+/**
+ * wait_event_interruptible_hrtimeout - sleep until a condition gets true or a timeout elapses
+ * @wq: the waitqueue to wait on
+ * @condition: a C expression for the event to wait for
+ * @timeout: timeout, as a ktime_t
+ *
+ * The process is put to sleep (TASK_INTERRUPTIBLE) until the
+ * @condition evaluates to true or a signal is received.
+ * The @condition is checked each time the waitqueue @wq is woken up.
+ *
+ * wake_up() has to be called after changing any variable that could
+ * change the result of the wait condition.
+ *
+ * The function returns 0 if @condition became true, -ERESTARTSYS if it was
+ * interrupted by a signal, or -ETIME if the timeout elapsed.
+ */
+#define wait_event_interruptible_hrtimeout(wq, condition, timeout)     \
+({                                                                     \
+       long __ret = 0;                                                 \
+       if (!(condition))                                               \
+               __ret = __wait_event_hrtimeout(wq, condition, timeout,  \
+                                              TASK_INTERRUPTIBLE);     \
+       __ret;                                                          \
+})
+
 #define __wait_event_interruptible_exclusive(wq, condition, ret)       \
 do {                                                                   \
        DEFINE_WAIT(__wait);                                            \
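
A hedged usage sketch for the new hrtimeout wait macros above; the wait queue,
flag and 2 ms timeout are invented for illustration:

	#include <linux/ktime.h>
	#include <linux/wait.h>

	static DECLARE_WAIT_QUEUE_HEAD(foo_wq);
	static bool foo_done;

	static long foo_wait_for_done(void)
	{
		long ret;

		ret = wait_event_interruptible_hrtimeout(foo_wq, foo_done,
					ktime_set(0, 2 * NSEC_PER_MSEC));
		/* 0: condition true, -ETIME: timed out, -ERESTARTSYS: signal */
		return ret;
	}
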
index 9a9367c0c0768bff86bfa9ddde5ff85574497292..579a5007c696fc5b9fd95cd326470b70aa096456 100644 (file)
@@ -5,6 +5,7 @@
 #define WRITEBACK_H
 
 #include <linux/sched.h>
+#include <linux/workqueue.h>
 #include <linux/fs.h>
 
 DECLARE_PER_CPU(int, dirty_throttle_leaks);
diff --git a/include/trace/events/bcache.h b/include/trace/events/bcache.h
new file mode 100644 (file)
index 0000000..3cc5a0b
--- /dev/null
@@ -0,0 +1,271 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM bcache
+
+#if !defined(_TRACE_BCACHE_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_BCACHE_H
+
+#include <linux/tracepoint.h>
+
+struct search;
+
+DECLARE_EVENT_CLASS(bcache_request,
+
+       TP_PROTO(struct search *s, struct bio *bio),
+
+       TP_ARGS(s, bio),
+
+       TP_STRUCT__entry(
+               __field(dev_t,          dev                     )
+               __field(unsigned int,   orig_major              )
+               __field(unsigned int,   orig_minor              )
+               __field(sector_t,       sector                  )
+               __field(dev_t,          orig_sector             )
+               __field(unsigned int,   nr_sector               )
+               __array(char,           rwbs,   6               )
+               __array(char,           comm,   TASK_COMM_LEN   )
+       ),
+
+       TP_fast_assign(
+               __entry->dev            = bio->bi_bdev->bd_dev;
+               __entry->orig_major     = s->d->disk->major;
+               __entry->orig_minor     = s->d->disk->first_minor;
+               __entry->sector         = bio->bi_sector;
+               __entry->orig_sector    = bio->bi_sector - 16;
+               __entry->nr_sector      = bio->bi_size >> 9;
+               blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
+               memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
+       ),
+
+       TP_printk("%d,%d %s %llu + %u [%s] (from %d,%d @ %llu)",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 __entry->rwbs,
+                 (unsigned long long)__entry->sector,
+                 __entry->nr_sector, __entry->comm,
+                 __entry->orig_major, __entry->orig_minor,
+                 (unsigned long long)__entry->orig_sector)
+);
+
+DEFINE_EVENT(bcache_request, bcache_request_start,
+
+       TP_PROTO(struct search *s, struct bio *bio),
+
+       TP_ARGS(s, bio)
+);
+
+DEFINE_EVENT(bcache_request, bcache_request_end,
+
+       TP_PROTO(struct search *s, struct bio *bio),
+
+       TP_ARGS(s, bio)
+);
+
+DECLARE_EVENT_CLASS(bcache_bio,
+
+       TP_PROTO(struct bio *bio),
+
+       TP_ARGS(bio),
+
+       TP_STRUCT__entry(
+               __field(dev_t,          dev                     )
+               __field(sector_t,       sector                  )
+               __field(unsigned int,   nr_sector               )
+               __array(char,           rwbs,   6               )
+               __array(char,           comm,   TASK_COMM_LEN   )
+       ),
+
+       TP_fast_assign(
+               __entry->dev            = bio->bi_bdev->bd_dev;
+               __entry->sector         = bio->bi_sector;
+               __entry->nr_sector      = bio->bi_size >> 9;
+               blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
+               memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
+       ),
+
+       TP_printk("%d,%d  %s %llu + %u [%s]",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 __entry->rwbs,
+                 (unsigned long long)__entry->sector,
+                 __entry->nr_sector, __entry->comm)
+);
+
+
+DEFINE_EVENT(bcache_bio, bcache_passthrough,
+
+       TP_PROTO(struct bio *bio),
+
+       TP_ARGS(bio)
+);
+
+DEFINE_EVENT(bcache_bio, bcache_cache_hit,
+
+       TP_PROTO(struct bio *bio),
+
+       TP_ARGS(bio)
+);
+
+DEFINE_EVENT(bcache_bio, bcache_cache_miss,
+
+       TP_PROTO(struct bio *bio),
+
+       TP_ARGS(bio)
+);
+
+DEFINE_EVENT(bcache_bio, bcache_read_retry,
+
+       TP_PROTO(struct bio *bio),
+
+       TP_ARGS(bio)
+);
+
+DEFINE_EVENT(bcache_bio, bcache_writethrough,
+
+       TP_PROTO(struct bio *bio),
+
+       TP_ARGS(bio)
+);
+
+DEFINE_EVENT(bcache_bio, bcache_writeback,
+
+       TP_PROTO(struct bio *bio),
+
+       TP_ARGS(bio)
+);
+
+DEFINE_EVENT(bcache_bio, bcache_write_skip,
+
+       TP_PROTO(struct bio *bio),
+
+       TP_ARGS(bio)
+);
+
+DEFINE_EVENT(bcache_bio, bcache_btree_read,
+
+       TP_PROTO(struct bio *bio),
+
+       TP_ARGS(bio)
+);
+
+DEFINE_EVENT(bcache_bio, bcache_btree_write,
+
+       TP_PROTO(struct bio *bio),
+
+       TP_ARGS(bio)
+);
+
+DEFINE_EVENT(bcache_bio, bcache_write_dirty,
+
+       TP_PROTO(struct bio *bio),
+
+       TP_ARGS(bio)
+);
+
+DEFINE_EVENT(bcache_bio, bcache_read_dirty,
+
+       TP_PROTO(struct bio *bio),
+
+       TP_ARGS(bio)
+);
+
+DEFINE_EVENT(bcache_bio, bcache_write_moving,
+
+       TP_PROTO(struct bio *bio),
+
+       TP_ARGS(bio)
+);
+
+DEFINE_EVENT(bcache_bio, bcache_read_moving,
+
+       TP_PROTO(struct bio *bio),
+
+       TP_ARGS(bio)
+);
+
+DEFINE_EVENT(bcache_bio, bcache_journal_write,
+
+       TP_PROTO(struct bio *bio),
+
+       TP_ARGS(bio)
+);
+
+DECLARE_EVENT_CLASS(bcache_cache_bio,
+
+       TP_PROTO(struct bio *bio,
+                sector_t orig_sector,
+                struct block_device* orig_bdev),
+
+       TP_ARGS(bio, orig_sector, orig_bdev),
+
+       TP_STRUCT__entry(
+               __field(dev_t,          dev                     )
+               __field(dev_t,          orig_dev                )
+               __field(sector_t,       sector                  )
+               __field(sector_t,       orig_sector             )
+               __field(unsigned int,   nr_sector               )
+               __array(char,           rwbs,   6               )
+               __array(char,           comm,   TASK_COMM_LEN   )
+       ),
+
+       TP_fast_assign(
+               __entry->dev            = bio->bi_bdev->bd_dev;
+               __entry->orig_dev       = orig_bdev->bd_dev;
+               __entry->sector         = bio->bi_sector;
+               __entry->orig_sector    = orig_sector;
+               __entry->nr_sector      = bio->bi_size >> 9;
+               blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
+               memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
+       ),
+
+       TP_printk("%d,%d  %s %llu + %u [%s] (from %d,%d %llu)",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 __entry->rwbs,
+                 (unsigned long long)__entry->sector,
+                 __entry->nr_sector, __entry->comm,
+                 MAJOR(__entry->orig_dev), MINOR(__entry->orig_dev),
+                 (unsigned long long)__entry->orig_sector)
+);
+
+DEFINE_EVENT(bcache_cache_bio, bcache_cache_insert,
+
+       TP_PROTO(struct bio *bio,
+                sector_t orig_sector,
+                struct block_device *orig_bdev),
+
+       TP_ARGS(bio, orig_sector, orig_bdev)
+);
+
+DECLARE_EVENT_CLASS(bcache_gc,
+
+       TP_PROTO(uint8_t *uuid),
+
+       TP_ARGS(uuid),
+
+       TP_STRUCT__entry(
+               __field(uint8_t *,      uuid)
+       ),
+
+       TP_fast_assign(
+               __entry->uuid           = uuid;
+       ),
+
+       TP_printk("%pU", __entry->uuid)
+);
+
+
+DEFINE_EVENT(bcache_gc, bcache_gc_start,
+
+            TP_PROTO(uint8_t *uuid),
+
+            TP_ARGS(uuid)
+);
+
+DEFINE_EVENT(bcache_gc, bcache_gc_end,
+
+            TP_PROTO(uint8_t *uuid),
+
+            TP_ARGS(uuid)
+);
+
+#endif /* _TRACE_BCACHE_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
index 9c1467357b03c616967cd193efab6506e3e5adff..60ae7c3db912de7e068452de1a1c1978cad0a662 100644 (file)
@@ -244,7 +244,7 @@ TRACE_EVENT(block_bio_bounce,
                __entry->dev            = bio->bi_bdev ?
                                          bio->bi_bdev->bd_dev : 0;
                __entry->sector         = bio->bi_sector;
-               __entry->nr_sector      = bio->bi_size >> 9;
+               __entry->nr_sector      = bio_sectors(bio);
                blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
                memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
        ),
@@ -281,7 +281,7 @@ TRACE_EVENT(block_bio_complete,
        TP_fast_assign(
                __entry->dev            = bio->bi_bdev->bd_dev;
                __entry->sector         = bio->bi_sector;
-               __entry->nr_sector      = bio->bi_size >> 9;
+               __entry->nr_sector      = bio_sectors(bio);
                __entry->error          = error;
                blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
        ),
@@ -309,7 +309,7 @@ DECLARE_EVENT_CLASS(block_bio_merge,
        TP_fast_assign(
                __entry->dev            = bio->bi_bdev->bd_dev;
                __entry->sector         = bio->bi_sector;
-               __entry->nr_sector      = bio->bi_size >> 9;
+               __entry->nr_sector      = bio_sectors(bio);
                blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
                memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
        ),
@@ -376,7 +376,7 @@ TRACE_EVENT(block_bio_queue,
        TP_fast_assign(
                __entry->dev            = bio->bi_bdev->bd_dev;
                __entry->sector         = bio->bi_sector;
-               __entry->nr_sector      = bio->bi_size >> 9;
+               __entry->nr_sector      = bio_sectors(bio);
                blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
                memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
        ),
@@ -404,7 +404,7 @@ DECLARE_EVENT_CLASS(block_get_rq,
        TP_fast_assign(
                __entry->dev            = bio ? bio->bi_bdev->bd_dev : 0;
                __entry->sector         = bio ? bio->bi_sector : 0;
-               __entry->nr_sector      = bio ? bio->bi_size >> 9 : 0;
+               __entry->nr_sector      = bio ? bio_sectors(bio) : 0;
                blk_fill_rwbs(__entry->rwbs,
                              bio ? bio->bi_rw : 0, __entry->nr_sector);
                memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
@@ -580,7 +580,7 @@ TRACE_EVENT(block_bio_remap,
        TP_fast_assign(
                __entry->dev            = bio->bi_bdev->bd_dev;
                __entry->sector         = bio->bi_sector;
-               __entry->nr_sector      = bio->bi_size >> 9;
+               __entry->nr_sector      = bio_sectors(bio);
                __entry->old_dev        = dev;
                __entry->old_sector     = from;
                blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h
new file mode 100644 (file)
index 0000000..52ae548
--- /dev/null
@@ -0,0 +1,682 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM f2fs
+
+#if !defined(_TRACE_F2FS_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_F2FS_H
+
+#include <linux/tracepoint.h>
+
+#define show_dev(entry)                MAJOR(entry->dev), MINOR(entry->dev)
+#define show_dev_ino(entry)    show_dev(entry), (unsigned long)entry->ino
+
+#define show_block_type(type)                                          \
+       __print_symbolic(type,                                          \
+               { NODE,         "NODE" },                               \
+               { DATA,         "DATA" },                               \
+               { META,         "META" },                               \
+               { META_FLUSH,   "META_FLUSH" })
+
+#define show_bio_type(type)                                            \
+       __print_symbolic(type,                                          \
+               { READ,         "READ" },                               \
+               { READA,        "READAHEAD" },                          \
+               { READ_SYNC,    "READ_SYNC" },                          \
+               { WRITE,        "WRITE" },                              \
+               { WRITE_SYNC,   "WRITE_SYNC" },                         \
+               { WRITE_FLUSH,  "WRITE_FLUSH" },                        \
+               { WRITE_FUA,    "WRITE_FUA" })
+
+#define show_data_type(type)                                           \
+       __print_symbolic(type,                                          \
+               { CURSEG_HOT_DATA,      "Hot DATA" },                   \
+               { CURSEG_WARM_DATA,     "Warm DATA" },                  \
+               { CURSEG_COLD_DATA,     "Cold DATA" },                  \
+               { CURSEG_HOT_NODE,      "Hot NODE" },                   \
+               { CURSEG_WARM_NODE,     "Warm NODE" },                  \
+               { CURSEG_COLD_NODE,     "Cold NODE" },                  \
+               { NO_CHECK_TYPE,        "No TYPE" })
+
+#define show_gc_type(type)                                             \
+       __print_symbolic(type,                                          \
+               { FG_GC,        "Foreground GC" },                      \
+               { BG_GC,        "Background GC" })
+
+#define show_alloc_mode(type)                                          \
+       __print_symbolic(type,                                          \
+               { LFS,  "LFS-mode" },                                   \
+               { SSR,  "SSR-mode" })
+
+#define show_victim_policy(type)                                       \
+       __print_symbolic(type,                                          \
+               { GC_GREEDY,    "Greedy" },                             \
+               { GC_CB,        "Cost-Benefit" })
+
+struct victim_sel_policy;
+
+DECLARE_EVENT_CLASS(f2fs__inode,
+
+       TP_PROTO(struct inode *inode),
+
+       TP_ARGS(inode),
+
+       TP_STRUCT__entry(
+               __field(dev_t,  dev)
+               __field(ino_t,  ino)
+               __field(ino_t,  pino)
+               __field(umode_t, mode)
+               __field(loff_t, size)
+               __field(unsigned int, nlink)
+               __field(blkcnt_t, blocks)
+               __field(__u8,   advise)
+       ),
+
+       TP_fast_assign(
+               __entry->dev    = inode->i_sb->s_dev;
+               __entry->ino    = inode->i_ino;
+               __entry->pino   = F2FS_I(inode)->i_pino;
+               __entry->mode   = inode->i_mode;
+               __entry->nlink  = inode->i_nlink;
+               __entry->size   = inode->i_size;
+               __entry->blocks = inode->i_blocks;
+               __entry->advise = F2FS_I(inode)->i_advise;
+       ),
+
+       TP_printk("dev = (%d,%d), ino = %lu, pino = %lu, i_mode = 0x%hx, "
+               "i_size = %lld, i_nlink = %u, i_blocks = %llu, i_advise = 0x%x",
+               show_dev_ino(__entry),
+               (unsigned long)__entry->pino,
+               __entry->mode,
+               __entry->size,
+               (unsigned int)__entry->nlink,
+               (unsigned long long)__entry->blocks,
+               (unsigned char)__entry->advise)
+);
+
+DECLARE_EVENT_CLASS(f2fs__inode_exit,
+
+       TP_PROTO(struct inode *inode, int ret),
+
+       TP_ARGS(inode, ret),
+
+       TP_STRUCT__entry(
+               __field(dev_t,  dev)
+               __field(ino_t,  ino)
+               __field(int,    ret)
+       ),
+
+       TP_fast_assign(
+               __entry->dev    = inode->i_sb->s_dev;
+               __entry->ino    = inode->i_ino;
+               __entry->ret    = ret;
+       ),
+
+       TP_printk("dev = (%d,%d), ino = %lu, ret = %d",
+               show_dev_ino(__entry),
+               __entry->ret)
+);
+
+DEFINE_EVENT(f2fs__inode, f2fs_sync_file_enter,
+
+       TP_PROTO(struct inode *inode),
+
+       TP_ARGS(inode)
+);
+
+TRACE_EVENT(f2fs_sync_file_exit,
+
+       TP_PROTO(struct inode *inode, bool need_cp, int datasync, int ret),
+
+       TP_ARGS(inode, need_cp, datasync, ret),
+
+       TP_STRUCT__entry(
+               __field(dev_t,  dev)
+               __field(ino_t,  ino)
+               __field(bool,   need_cp)
+               __field(int,    datasync)
+               __field(int,    ret)
+       ),
+
+       TP_fast_assign(
+               __entry->dev            = inode->i_sb->s_dev;
+               __entry->ino            = inode->i_ino;
+               __entry->need_cp        = need_cp;
+               __entry->datasync       = datasync;
+               __entry->ret            = ret;
+       ),
+
+       TP_printk("dev = (%d,%d), ino = %lu, checkpoint is %s, "
+               "datasync = %d, ret = %d",
+               show_dev_ino(__entry),
+               __entry->need_cp ? "needed" : "not needed",
+               __entry->datasync,
+               __entry->ret)
+);
+
+TRACE_EVENT(f2fs_sync_fs,
+
+       TP_PROTO(struct super_block *sb, int wait),
+
+       TP_ARGS(sb, wait),
+
+       TP_STRUCT__entry(
+               __field(dev_t,  dev)
+               __field(int,    dirty)
+               __field(int,    wait)
+       ),
+
+       TP_fast_assign(
+               __entry->dev    = sb->s_dev;
+               __entry->dirty  = F2FS_SB(sb)->s_dirty;
+               __entry->wait   = wait;
+       ),
+
+       TP_printk("dev = (%d,%d), superblock is %s, wait = %d",
+               show_dev(__entry),
+               __entry->dirty ? "dirty" : "not dirty",
+               __entry->wait)
+);
+
+DEFINE_EVENT(f2fs__inode, f2fs_iget,
+
+       TP_PROTO(struct inode *inode),
+
+       TP_ARGS(inode)
+);
+
+DEFINE_EVENT(f2fs__inode_exit, f2fs_iget_exit,
+
+       TP_PROTO(struct inode *inode, int ret),
+
+       TP_ARGS(inode, ret)
+);
+
+DEFINE_EVENT(f2fs__inode, f2fs_evict_inode,
+
+       TP_PROTO(struct inode *inode),
+
+       TP_ARGS(inode)
+);
+
+DEFINE_EVENT(f2fs__inode_exit, f2fs_new_inode,
+
+       TP_PROTO(struct inode *inode, int ret),
+
+       TP_ARGS(inode, ret)
+);
+
+TRACE_EVENT(f2fs_unlink_enter,
+
+       TP_PROTO(struct inode *dir, struct dentry *dentry),
+
+       TP_ARGS(dir, dentry),
+
+       TP_STRUCT__entry(
+               __field(dev_t,  dev)
+               __field(ino_t,  ino)
+               __field(loff_t, size)
+               __field(blkcnt_t, blocks)
+               __field(const char *,   name)
+       ),
+
+       TP_fast_assign(
+               __entry->dev    = dir->i_sb->s_dev;
+               __entry->ino    = dir->i_ino;
+               __entry->size   = dir->i_size;
+               __entry->blocks = dir->i_blocks;
+               __entry->name   = dentry->d_name.name;
+       ),
+
+       TP_printk("dev = (%d,%d), dir ino = %lu, i_size = %lld, "
+               "i_blocks = %llu, name = %s",
+               show_dev_ino(__entry),
+               __entry->size,
+               (unsigned long long)__entry->blocks,
+               __entry->name)
+);
+
+DEFINE_EVENT(f2fs__inode_exit, f2fs_unlink_exit,
+
+       TP_PROTO(struct inode *inode, int ret),
+
+       TP_ARGS(inode, ret)
+);
+
+DEFINE_EVENT(f2fs__inode, f2fs_truncate,
+
+       TP_PROTO(struct inode *inode),
+
+       TP_ARGS(inode)
+);
+
+TRACE_EVENT(f2fs_truncate_data_blocks_range,
+
+       TP_PROTO(struct inode *inode, nid_t nid, unsigned int ofs, int free),
+
+       TP_ARGS(inode, nid,  ofs, free),
+
+       TP_STRUCT__entry(
+               __field(dev_t,  dev)
+               __field(ino_t,  ino)
+               __field(nid_t,  nid)
+               __field(unsigned int,   ofs)
+               __field(int,    free)
+       ),
+
+       TP_fast_assign(
+               __entry->dev    = inode->i_sb->s_dev;
+               __entry->ino    = inode->i_ino;
+               __entry->nid    = nid;
+               __entry->ofs    = ofs;
+               __entry->free   = free;
+       ),
+
+       TP_printk("dev = (%d,%d), ino = %lu, nid = %u, offset = %u, freed = %d",
+               show_dev_ino(__entry),
+               (unsigned int)__entry->nid,
+               __entry->ofs,
+               __entry->free)
+);
+
+DECLARE_EVENT_CLASS(f2fs__truncate_op,
+
+       TP_PROTO(struct inode *inode, u64 from),
+
+       TP_ARGS(inode, from),
+
+       TP_STRUCT__entry(
+               __field(dev_t,  dev)
+               __field(ino_t,  ino)
+               __field(loff_t, size)
+               __field(blkcnt_t, blocks)
+               __field(u64,    from)
+       ),
+
+       TP_fast_assign(
+               __entry->dev    = inode->i_sb->s_dev;
+               __entry->ino    = inode->i_ino;
+               __entry->size   = inode->i_size;
+               __entry->blocks = inode->i_blocks;
+               __entry->from   = from;
+       ),
+
+       TP_printk("dev = (%d,%d), ino = %lu, i_size = %lld, i_blocks = %llu, "
+               "start file offset = %llu",
+               show_dev_ino(__entry),
+               __entry->size,
+               (unsigned long long)__entry->blocks,
+               (unsigned long long)__entry->from)
+);
+
+DEFINE_EVENT(f2fs__truncate_op, f2fs_truncate_blocks_enter,
+
+       TP_PROTO(struct inode *inode, u64 from),
+
+       TP_ARGS(inode, from)
+);
+
+DEFINE_EVENT(f2fs__inode_exit, f2fs_truncate_blocks_exit,
+
+       TP_PROTO(struct inode *inode, int ret),
+
+       TP_ARGS(inode, ret)
+);
+
+DEFINE_EVENT(f2fs__truncate_op, f2fs_truncate_inode_blocks_enter,
+
+       TP_PROTO(struct inode *inode, u64 from),
+
+       TP_ARGS(inode, from)
+);
+
+DEFINE_EVENT(f2fs__inode_exit, f2fs_truncate_inode_blocks_exit,
+
+       TP_PROTO(struct inode *inode, int ret),
+
+       TP_ARGS(inode, ret)
+);
+
+DECLARE_EVENT_CLASS(f2fs__truncate_node,
+
+       TP_PROTO(struct inode *inode, nid_t nid, block_t blk_addr),
+
+       TP_ARGS(inode, nid, blk_addr),
+
+       TP_STRUCT__entry(
+               __field(dev_t,  dev)
+               __field(ino_t,  ino)
+               __field(nid_t,  nid)
+               __field(block_t,        blk_addr)
+       ),
+
+       TP_fast_assign(
+               __entry->dev            = inode->i_sb->s_dev;
+               __entry->ino            = inode->i_ino;
+               __entry->nid            = nid;
+               __entry->blk_addr       = blk_addr;
+       ),
+
+       TP_printk("dev = (%d,%d), ino = %lu, nid = %u, block_address = 0x%llx",
+               show_dev_ino(__entry),
+               (unsigned int)__entry->nid,
+               (unsigned long long)__entry->blk_addr)
+);
+
+DEFINE_EVENT(f2fs__truncate_node, f2fs_truncate_nodes_enter,
+
+       TP_PROTO(struct inode *inode, nid_t nid, block_t blk_addr),
+
+       TP_ARGS(inode, nid, blk_addr)
+);
+
+DEFINE_EVENT(f2fs__inode_exit, f2fs_truncate_nodes_exit,
+
+       TP_PROTO(struct inode *inode, int ret),
+
+       TP_ARGS(inode, ret)
+);
+
+DEFINE_EVENT(f2fs__truncate_node, f2fs_truncate_node,
+
+       TP_PROTO(struct inode *inode, nid_t nid, block_t blk_addr),
+
+       TP_ARGS(inode, nid, blk_addr)
+);
+
+TRACE_EVENT(f2fs_truncate_partial_nodes,
+
+       TP_PROTO(struct inode *inode, nid_t nid[], int depth, int err),
+
+       TP_ARGS(inode, nid, depth, err),
+
+       TP_STRUCT__entry(
+               __field(dev_t,  dev)
+               __field(ino_t,  ino)
+               __field(nid_t,  nid[3])
+               __field(int,    depth)
+               __field(int,    err)
+       ),
+
+       TP_fast_assign(
+               __entry->dev    = inode->i_sb->s_dev;
+               __entry->ino    = inode->i_ino;
+               __entry->nid[0] = nid[0];
+               __entry->nid[1] = nid[1];
+               __entry->nid[2] = nid[2];
+               __entry->depth  = depth;
+               __entry->err    = err;
+       ),
+
+       TP_printk("dev = (%d,%d), ino = %lu, "
+               "nid[0] = %u, nid[1] = %u, nid[2] = %u, depth = %d, err = %d",
+               show_dev_ino(__entry),
+               (unsigned int)__entry->nid[0],
+               (unsigned int)__entry->nid[1],
+               (unsigned int)__entry->nid[2],
+               __entry->depth,
+               __entry->err)
+);
+
+TRACE_EVENT_CONDITION(f2fs_readpage,
+
+       TP_PROTO(struct page *page, sector_t blkaddr, int type),
+
+       TP_ARGS(page, blkaddr, type),
+
+       TP_CONDITION(page->mapping),
+
+       TP_STRUCT__entry(
+               __field(dev_t,  dev)
+               __field(ino_t,  ino)
+               __field(pgoff_t,        index)
+               __field(sector_t,       blkaddr)
+               __field(int,    type)
+       ),
+
+       TP_fast_assign(
+               __entry->dev            = page->mapping->host->i_sb->s_dev;
+               __entry->ino            = page->mapping->host->i_ino;
+               __entry->index          = page->index;
+               __entry->blkaddr        = blkaddr;
+               __entry->type           = type;
+       ),
+
+       TP_printk("dev = (%d,%d), ino = %lu, page_index = 0x%lx, "
+               "blkaddr = 0x%llx, bio_type = %s",
+               show_dev_ino(__entry),
+               (unsigned long)__entry->index,
+               (unsigned long long)__entry->blkaddr,
+               show_bio_type(__entry->type))
+);
+
+TRACE_EVENT(f2fs_get_data_block,
+       TP_PROTO(struct inode *inode, sector_t iblock,
+                               struct buffer_head *bh, int ret),
+
+       TP_ARGS(inode, iblock, bh, ret),
+
+       TP_STRUCT__entry(
+               __field(dev_t,  dev)
+               __field(ino_t,  ino)
+               __field(sector_t,       iblock)
+               __field(sector_t,       bh_start)
+               __field(size_t, bh_size)
+               __field(int,    ret)
+       ),
+
+       TP_fast_assign(
+               __entry->dev            = inode->i_sb->s_dev;
+               __entry->ino            = inode->i_ino;
+               __entry->iblock         = iblock;
+               __entry->bh_start       = bh->b_blocknr;
+               __entry->bh_size        = bh->b_size;
+               __entry->ret            = ret;
+       ),
+
+       TP_printk("dev = (%d,%d), ino = %lu, file offset = %llu, "
+               "start blkaddr = 0x%llx, len = 0x%llx bytes, err = %d",
+               show_dev_ino(__entry),
+               (unsigned long long)__entry->iblock,
+               (unsigned long long)__entry->bh_start,
+               (unsigned long long)__entry->bh_size,
+               __entry->ret)
+);
+
+TRACE_EVENT(f2fs_get_victim,
+
+       TP_PROTO(struct super_block *sb, int type, int gc_type,
+                       struct victim_sel_policy *p, unsigned int pre_victim,
+                       unsigned int prefree, unsigned int free),
+
+       TP_ARGS(sb, type, gc_type, p, pre_victim, prefree, free),
+
+       TP_STRUCT__entry(
+               __field(dev_t,  dev)
+               __field(int,    type)
+               __field(int,    gc_type)
+               __field(int,    alloc_mode)
+               __field(int,    gc_mode)
+               __field(unsigned int,   victim)
+               __field(unsigned int,   ofs_unit)
+               __field(unsigned int,   pre_victim)
+               __field(unsigned int,   prefree)
+               __field(unsigned int,   free)
+       ),
+
+       TP_fast_assign(
+               __entry->dev            = sb->s_dev;
+               __entry->type           = type;
+               __entry->gc_type        = gc_type;
+               __entry->alloc_mode     = p->alloc_mode;
+               __entry->gc_mode        = p->gc_mode;
+               __entry->victim         = p->min_segno;
+               __entry->ofs_unit       = p->ofs_unit;
+               __entry->pre_victim     = pre_victim;
+               __entry->prefree        = prefree;
+               __entry->free           = free;
+       ),
+
+       TP_printk("dev = (%d,%d), type = %s, policy = (%s, %s, %s), victim = %u "
+               "ofs_unit = %u, pre_victim_secno = %d, prefree = %u, free = %u",
+               show_dev(__entry),
+               show_data_type(__entry->type),
+               show_gc_type(__entry->gc_type),
+               show_alloc_mode(__entry->alloc_mode),
+               show_victim_policy(__entry->gc_mode),
+               __entry->victim,
+               __entry->ofs_unit,
+               (int)__entry->pre_victim,
+               __entry->prefree,
+               __entry->free)
+);
+
+TRACE_EVENT(f2fs_fallocate,
+
+       TP_PROTO(struct inode *inode, int mode,
+                               loff_t offset, loff_t len, int ret),
+
+       TP_ARGS(inode, mode, offset, len, ret),
+
+       TP_STRUCT__entry(
+               __field(dev_t,  dev)
+               __field(ino_t,  ino)
+               __field(int,    mode)
+               __field(loff_t, offset)
+               __field(loff_t, len)
+               __field(loff_t, size)
+               __field(blkcnt_t, blocks)
+               __field(int,    ret)
+       ),
+
+       TP_fast_assign(
+               __entry->dev    = inode->i_sb->s_dev;
+               __entry->ino    = inode->i_ino;
+               __entry->mode   = mode;
+               __entry->offset = offset;
+               __entry->len    = len;
+               __entry->size   = inode->i_size;
+               __entry->blocks = inode->i_blocks;
+               __entry->ret    = ret;
+       ),
+
+       TP_printk("dev = (%d,%d), ino = %lu, mode = %x, offset = %lld, "
+               "len = %lld,  i_size = %lld, i_blocks = %llu, ret = %d",
+               show_dev_ino(__entry),
+               __entry->mode,
+               (unsigned long long)__entry->offset,
+               (unsigned long long)__entry->len,
+               (unsigned long long)__entry->size,
+               (unsigned long long)__entry->blocks,
+               __entry->ret)
+);
+
+TRACE_EVENT(f2fs_reserve_new_block,
+
+       TP_PROTO(struct inode *inode, nid_t nid, unsigned int ofs_in_node),
+
+       TP_ARGS(inode, nid, ofs_in_node),
+
+       TP_STRUCT__entry(
+               __field(dev_t,  dev)
+               __field(nid_t, nid)
+               __field(unsigned int, ofs_in_node)
+       ),
+
+       TP_fast_assign(
+               __entry->dev    = inode->i_sb->s_dev;
+               __entry->nid    = nid;
+               __entry->ofs_in_node = ofs_in_node;
+       ),
+
+       TP_printk("dev = (%d,%d), nid = %u, ofs_in_node = %u",
+               show_dev(__entry),
+               (unsigned int)__entry->nid,
+               __entry->ofs_in_node)
+);
+
+TRACE_EVENT(f2fs_do_submit_bio,
+
+       TP_PROTO(struct super_block *sb, int btype, bool sync, struct bio *bio),
+
+       TP_ARGS(sb, btype, sync, bio),
+
+       TP_STRUCT__entry(
+               __field(dev_t,  dev)
+               __field(int,    btype)
+               __field(bool,   sync)
+               __field(sector_t,       sector)
+               __field(unsigned int,   size)
+       ),
+
+       TP_fast_assign(
+               __entry->dev            = sb->s_dev;
+               __entry->btype          = btype;
+               __entry->sync           = sync;
+               __entry->sector         = bio->bi_sector;
+               __entry->size           = bio->bi_size;
+       ),
+
+       TP_printk("dev = (%d,%d), type = %s, io = %s, sector = %lld, size = %u",
+               show_dev(__entry),
+               show_block_type(__entry->btype),
+               __entry->sync ? "sync" : "no sync",
+               (unsigned long long)__entry->sector,
+               __entry->size)
+);
+
+TRACE_EVENT(f2fs_submit_write_page,
+
+       TP_PROTO(struct page *page, block_t blk_addr, int type),
+
+       TP_ARGS(page, blk_addr, type),
+
+       TP_STRUCT__entry(
+               __field(dev_t,  dev)
+               __field(ino_t,  ino)
+               __field(int, type)
+               __field(pgoff_t, index)
+               __field(block_t, block)
+       ),
+
+       TP_fast_assign(
+               __entry->dev    = page->mapping->host->i_sb->s_dev;
+               __entry->ino    = page->mapping->host->i_ino;
+               __entry->type   = type;
+               __entry->index  = page->index;
+               __entry->block  = blk_addr;
+       ),
+
+       TP_printk("dev = (%d,%d), ino = %lu, %s, index = %lu, blkaddr = 0x%llx",
+               show_dev_ino(__entry),
+               show_block_type(__entry->type),
+               (unsigned long)__entry->index,
+               (unsigned long long)__entry->block)
+);
+
+TRACE_EVENT(f2fs_write_checkpoint,
+
+       TP_PROTO(struct super_block *sb, bool is_umount, char *msg),
+
+       TP_ARGS(sb, is_umount, msg),
+
+       TP_STRUCT__entry(
+               __field(dev_t,  dev)
+               __field(bool,   is_umount)
+               __field(char *, msg)
+       ),
+
+       TP_fast_assign(
+               __entry->dev            = sb->s_dev;
+               __entry->is_umount      = is_umount;
+               __entry->msg            = msg;
+       ),
+
+       TP_printk("dev = (%d,%d), checkpoint for %s, state = %s",
+               show_dev(__entry),
+               __entry->is_umount ? "clean umount" : "consistency",
+               __entry->msg)
+);
+
+#endif /* _TRACE_F2FS_H */
+
+ /* This part must be outside protection */
+#include <trace/define_trace.h>
index 6a16fd2e70ed27741ed13f80156d713d38127fb1..464ea82e10dbf1f1a519b75c52f9e129a5a715e0 100644 (file)
@@ -183,7 +183,6 @@ DECLARE_EVENT_CLASS(writeback_work_class,
 DEFINE_EVENT(writeback_work_class, name, \
        TP_PROTO(struct backing_dev_info *bdi, struct wb_writeback_work *work), \
        TP_ARGS(bdi, work))
-DEFINE_WRITEBACK_WORK_EVENT(writeback_nothread);
 DEFINE_WRITEBACK_WORK_EVENT(writeback_queue);
 DEFINE_WRITEBACK_WORK_EVENT(writeback_exec);
 DEFINE_WRITEBACK_WORK_EVENT(writeback_start);
@@ -222,12 +221,8 @@ DEFINE_EVENT(writeback_class, name, \
 
 DEFINE_WRITEBACK_EVENT(writeback_nowork);
 DEFINE_WRITEBACK_EVENT(writeback_wake_background);
-DEFINE_WRITEBACK_EVENT(writeback_wake_thread);
-DEFINE_WRITEBACK_EVENT(writeback_wake_forker_thread);
 DEFINE_WRITEBACK_EVENT(writeback_bdi_register);
 DEFINE_WRITEBACK_EVENT(writeback_bdi_unregister);
-DEFINE_WRITEBACK_EVENT(writeback_thread_start);
-DEFINE_WRITEBACK_EVENT(writeback_thread_stop);
 
 DECLARE_EVENT_CLASS(wbc_class,
        TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi),
index 9ca1007edd930632e5942d2626caba02943d032a..ee6b3c442baf008cbee2037720bbbca8e1e411d5 100644 (file)
  */
 
 /* some useful defines for sb1000.c and cmconfig.c - fv */
-#define SIOCGCMSTATS           SIOCDEVPRIVATE+0        /* get cable modem stats */
-#define SIOCGCMFIRMWARE                SIOCDEVPRIVATE+1        /* get cm firmware version */
-#define SIOCGCMFREQUENCY       SIOCDEVPRIVATE+2        /* get cable modem frequency */
-#define SIOCSCMFREQUENCY       SIOCDEVPRIVATE+3        /* set cable modem frequency */
-#define SIOCGCMPIDS                    SIOCDEVPRIVATE+4        /* get cable modem PIDs */
-#define SIOCSCMPIDS                    SIOCDEVPRIVATE+5        /* set cable modem PIDs */
+#define SIOCGCMSTATS           (SIOCDEVPRIVATE+0)      /* get cable modem stats */
+#define SIOCGCMFIRMWARE                (SIOCDEVPRIVATE+1)      /* get cm firmware version */
+#define SIOCGCMFREQUENCY       (SIOCDEVPRIVATE+2)      /* get cable modem frequency */
+#define SIOCSCMFREQUENCY       (SIOCDEVPRIVATE+3)      /* set cable modem frequency */
+#define SIOCGCMPIDS                    (SIOCDEVPRIVATE+4)      /* get cable modem PIDs */
+#define SIOCSCMPIDS                    (SIOCDEVPRIVATE+5)      /* set cable modem PIDs */
 
 #endif
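
The added parentheses matter because these macros can appear inside larger
expressions where operator precedence would otherwise split them apart. A
standalone illustration in ordinary C; the values and names are chosen only
for the demonstration:

	#include <stdio.h>

	#define SIOCDEVPRIVATE_X	0x89F0			/* example base value   */
	#define SIOCGCMFREQ_OLD		SIOCDEVPRIVATE_X+2	/* unparenthesized      */
	#define SIOCGCMFREQ_NEW		(SIOCDEVPRIVATE_X+2)	/* parenthesized        */

	int main(void)
	{
		int cmd = 0x89F2;	/* i.e. base + 2 */

		/* expands to cmd - SIOCDEVPRIVATE_X + 2, printing 4 instead of 0 */
		printf("old: %d\n", cmd - SIOCGCMFREQ_OLD);
		/* expands to cmd - (SIOCDEVPRIVATE_X + 2), printing 0 as intended */
		printf("new: %d\n", cmd - SIOCGCMFREQ_NEW);
		return 0;
	}
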
index 8247c49ec073c236d183ead9f51ab1f4b3779337..34af1fe34701afd06678ea83a47649b1947a4a65 100644 (file)
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -491,10 +491,14 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
 
        sprintf (name, "SYSV%08x", key);
        if (shmflg & SHM_HUGETLB) {
+               struct hstate *hs = hstate_sizelog((shmflg >> SHM_HUGE_SHIFT)
+                                               & SHM_HUGE_MASK);
+               size_t hugesize = ALIGN(size, huge_page_size(hs));
+
                /* hugetlb_file_setup applies strict accounting */
                if (shmflg & SHM_NORESERVE)
                        acctflag = VM_NORESERVE;
-               file = hugetlb_file_setup(name, 0, size, acctflag,
+               file = hugetlb_file_setup(name, hugesize, acctflag,
                                  &shp->mlock_user, HUGETLB_SHMFS_INODE,
                                (shmflg >> SHM_HUGE_SHIFT) & SHM_HUGE_MASK);
        } else {
index 7d40687b14344a934437d91ae3a8be065980c466..987b28a1f01b6c6ce5d554eb22d9c89e1cde1e7b 100644 (file)
@@ -70,6 +70,7 @@
 #include <linux/khugepaged.h>
 #include <linux/signalfd.h>
 #include <linux/uprobes.h>
+#include <linux/aio.h>
 
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
@@ -1303,6 +1304,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
        p->memcg_batch.do_batch = 0;
        p->memcg_batch.memcg = NULL;
 #endif
+#ifdef CONFIG_BCACHE
+       p->sequential_io        = 0;
+       p->sequential_io_avg    = 0;
+#endif
 
        /* Perform scheduler related setup. Assign this task to a CPU. */
        sched_fork(p);
index 6a3bccba7e7df98df709c11c9284ef146eb5c00c..1f3186b37fd5390be1534f895eb413de19e0a6d7 100644 (file)
@@ -2998,6 +2998,7 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name,
 EXPORT_SYMBOL_GPL(lockdep_init_map);
 
 struct lock_class_key __lockdep_no_validate__;
+EXPORT_SYMBOL_GPL(__lockdep_no_validate__);
 
 static int
 print_lock_nested_lock_not_held(struct task_struct *curr,
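Exporting __lockdep_no_validate__ makes lockdep_set_novalidate_class() usable from modules; that macro reassigns a lock to this special class so lockdep stops validating its dependency chains. A minimal, hypothetical module sketch (names invented) of how a module might opt one of its own locks out:

/* Sketch only: a driver-private mutex excluded from lockdep validation. */
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/lockdep.h>

static DEFINE_MUTEX(demo_lock);

static int __init demo_init(void)
{
	/* hangs demo_lock off __lockdep_no_validate__ */
	lockdep_set_novalidate_class(&demo_lock);

	mutex_lock(&demo_lock);
	mutex_unlock(&demo_lock);
	return 0;
}
module_init(demo_init);

MODULE_LICENSE("GPL");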
index 96dcfcd9a2d40fa9868ef84bf6fe6e387734f984..fa36e149442092f28dd339b03345f2a07fe91b31 100644 (file)
@@ -32,6 +32,7 @@
 #include <linux/security.h>
 #include <linux/bootmem.h>
 #include <linux/memblock.h>
+#include <linux/aio.h>
 #include <linux/syscalls.h>
 #include <linux/kexec.h>
 #include <linux/kdb.h>
index 17ae54da0ec2efb010184c38daf1c43953a6745c..aed981a3f69c180dda76bfca76411656842eec01 100644 (file)
@@ -17,6 +17,7 @@
 #include <linux/ptrace.h>
 #include <linux/security.h>
 #include <linux/signal.h>
+#include <linux/uio.h>
 #include <linux/audit.h>
 #include <linux/pid_namespace.h>
 #include <linux/syscalls.h>
index eef0d113b79ed22734b980b076e05c3aee73a64c..b91488ba2e5a7edb6e3cbdc5876adaf9ae7f2791 100644 (file)
@@ -234,7 +234,6 @@ static void relay_destroy_buf(struct rchan_buf *buf)
 static void relay_remove_buf(struct kref *kref)
 {
        struct rchan_buf *buf = container_of(kref, struct rchan_buf, kref);
-       buf->chan->cb->remove_buf_file(buf->dentry);
        relay_destroy_buf(buf);
 }
 
@@ -484,6 +483,7 @@ static void relay_close_buf(struct rchan_buf *buf)
 {
        buf->finalized = 1;
        del_timer_sync(&buf->timer);
+       buf->chan->cb->remove_buf_file(buf->dentry);
        kref_put(&buf->kref, relay_remove_buf);
 }
 
index b3c6c3fcd8474237a2e41ab4db87ee04c2435b7b..cfff1435bdfb2f1e6d8a88d797c2a205f9145daf 100644 (file)
@@ -126,6 +126,15 @@ void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest)
 
 EXPORT_SYMBOL(_down_write_nest_lock);
 
+void down_read_non_owner(struct rw_semaphore *sem)
+{
+       might_sleep();
+
+       __down_read(sem);
+}
+
+EXPORT_SYMBOL(down_read_non_owner);
+
 void down_write_nested(struct rw_semaphore *sem, int subclass)
 {
        might_sleep();
@@ -136,6 +145,13 @@ void down_write_nested(struct rw_semaphore *sem, int subclass)
 
 EXPORT_SYMBOL(down_write_nested);
 
+void up_read_non_owner(struct rw_semaphore *sem)
+{
+       __up_read(sem);
+}
+
+EXPORT_SYMBOL(up_read_non_owner);
+
 #endif
 
 
index ed58a3216a6dd04ffe5c7055e850bf691a62c768..b8b8560bfb95af43d36817d743f6218f01ed0edb 100644 (file)
@@ -1808,6 +1808,7 @@ void blk_fill_rwbs(char *rwbs, u32 rw, int bytes)
 
        rwbs[i] = '\0';
 }
+EXPORT_SYMBOL_GPL(blk_fill_rwbs);
 
 #endif /* CONFIG_EVENT_TRACING */
 
index a65486613d79775bf87918b390d8edd608d55cef..b7e29a6056d3eb583728f0f381eaab07a61cfd03 100644 (file)
@@ -529,7 +529,7 @@ struct kobject *kobject_get(struct kobject *kobj)
        return kobj;
 }
 
-static struct kobject *kobject_get_unless_zero(struct kobject *kobj)
+static struct kobject * __must_check kobject_get_unless_zero(struct kobject *kobj)
 {
        if (!kref_get_unless_zero(&kobj->kref))
                kobj = NULL;
index 8335d39d2ccdca072fb8964976ffdb00e91aebad..4a83ecd03650157d47ca3a68b8c6273f9dff73c8 100644 (file)
@@ -365,7 +365,13 @@ static int lc_unused_element_available(struct lru_cache *lc)
        return 0;
 }
 
-static struct lc_element *__lc_get(struct lru_cache *lc, unsigned int enr, bool may_change)
+/* used as internal flags to __lc_get */
+enum {
+       LC_GET_MAY_CHANGE = 1,
+       LC_GET_MAY_USE_UNCOMMITTED = 2,
+};
+
+static struct lc_element *__lc_get(struct lru_cache *lc, unsigned int enr, unsigned int flags)
 {
        struct lc_element *e;
 
@@ -380,22 +386,31 @@ static struct lc_element *__lc_get(struct lru_cache *lc, unsigned int enr, bool
         * this enr is currently being pulled in already,
         * and will be available once the pending transaction
         * has been committed. */
-       if (e && e->lc_new_number == e->lc_number) {
+       if (e) {
+               if (e->lc_new_number != e->lc_number) {
+                       /* It has been found above, but on the "to_be_changed"
+                        * list, not yet committed.  Don't pull it in twice,
+                        * wait for the transaction, then try again...
+                        */
+                       if (!(flags & LC_GET_MAY_USE_UNCOMMITTED))
+                               RETURN(NULL);
+                       /* ... unless the caller is aware of the implications,
+                        * probably preparing a cumulative transaction. */
+                       ++e->refcnt;
+                       ++lc->hits;
+                       RETURN(e);
+               }
+               /* else: lc_new_number == lc_number; a real hit. */
                ++lc->hits;
                if (e->refcnt++ == 0)
                        lc->used++;
                list_move(&e->list, &lc->in_use); /* Not evictable... */
                RETURN(e);
        }
+       /* e == NULL */
 
        ++lc->misses;
-       if (!may_change)
-               RETURN(NULL);
-
-       /* It has been found above, but on the "to_be_changed" list, not yet
-        * committed.  Don't pull it in twice, wait for the transaction, then
-        * try again */
-       if (e)
+       if (!(flags & LC_GET_MAY_CHANGE))
                RETURN(NULL);
 
        /* To avoid races with lc_try_lock(), first, mark us dirty
@@ -477,7 +492,27 @@ static struct lc_element *__lc_get(struct lru_cache *lc, unsigned int enr, bool
  */
 struct lc_element *lc_get(struct lru_cache *lc, unsigned int enr)
 {
-       return __lc_get(lc, enr, 1);
+       return __lc_get(lc, enr, LC_GET_MAY_CHANGE);
+}
+
+/**
+ * lc_get_cumulative - like lc_get; also finds to-be-changed elements
+ * @lc: the lru cache to operate on
+ * @enr: the label to look up
+ *
+ * Unlike lc_get() this also returns the element for @enr even if it belongs to
+ * a pending transaction, so the return values are like for lc_get(),
+ * plus:
+ *
+ * pointer to an element already on the "to_be_changed" list.
+ *     In this case, the cache was already marked %LC_DIRTY.
+ *
+ * Caller needs to make sure that the pending transaction is completed,
+ * before proceeding to actually use this element.
+ */
+struct lc_element *lc_get_cumulative(struct lru_cache *lc, unsigned int enr)
+{
+       return __lc_get(lc, enr, LC_GET_MAY_CHANGE|LC_GET_MAY_USE_UNCOMMITTED);
 }
 
 /**
@@ -648,3 +683,4 @@ EXPORT_SYMBOL(lc_seq_printf_stats);
 EXPORT_SYMBOL(lc_seq_dump_details);
 EXPORT_SYMBOL(lc_try_lock);
 EXPORT_SYMBOL(lc_is_used);
+EXPORT_SYMBOL(lc_get_cumulative);
index cf0ad2ad19f5a999bb688df8edc4433e4e174832..19c5fa95e0b4d7d06587d5fc5feb2ff0295f1efe 100644 (file)
@@ -223,7 +223,9 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem)
                        count = RWSEM_ACTIVE_WRITE_BIAS;
                        if (!list_is_singular(&sem->wait_list))
                                count += RWSEM_WAITING_BIAS;
-                       if (cmpxchg(&sem->count, RWSEM_WAITING_BIAS, count) ==
+
+                       if (sem->count == RWSEM_WAITING_BIAS &&
+                           cmpxchg(&sem->count, RWSEM_WAITING_BIAS, count) ==
                                                        RWSEM_WAITING_BIAS)
                                break;
                }
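The added sem->count check is the usual read-before-cmpxchg pattern: a plain load that fails keeps the cache line shared, so only a compare-and-swap that is likely to succeed pays for exclusive ownership. A self-contained userspace sketch of the same idea, using C11 atomics rather than the kernel primitives (names and values invented):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define FREE    1L
#define CLAIMED 2L

/* "Test, then compare-and-swap": the relaxed load fails cheaply and
 * keeps the cache line shared; only a likely-to-succeed attempt pays
 * for the read-for-ownership traffic of the CAS itself. */
static bool try_claim(_Atomic long *count)
{
	long expected = FREE;

	if (atomic_load_explicit(count, memory_order_relaxed) != FREE)
		return false;
	return atomic_compare_exchange_strong(count, &expected, CLAIMED);
}

int main(void)
{
	_Atomic long count = FREE;

	printf("first claim:  %d\n", try_claim(&count));   /* 1 */
	printf("second claim: %d\n", try_claim(&count));   /* 0 */
	return 0;
}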
index 41733c5dc820af44282d382ebcc3c938eca912ac..50251749225885d958a1b531519326bd177ecf2e 100644 (file)
@@ -31,13 +31,14 @@ EXPORT_SYMBOL_GPL(noop_backing_dev_info);
 static struct class *bdi_class;
 
 /*
- * bdi_lock protects updates to bdi_list and bdi_pending_list, as well as
- * reader side protection for bdi_pending_list. bdi_list has RCU reader side
+ * bdi_lock protects updates to bdi_list. bdi_list has RCU reader side
  * locking.
  */
 DEFINE_SPINLOCK(bdi_lock);
 LIST_HEAD(bdi_list);
-LIST_HEAD(bdi_pending_list);
+
+/* bdi_wq serves all asynchronous writeback tasks */
+struct workqueue_struct *bdi_wq;
 
 void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2)
 {
@@ -257,6 +258,11 @@ static int __init default_bdi_init(void)
 {
        int err;
 
+       bdi_wq = alloc_workqueue("writeback", WQ_MEM_RECLAIM | WQ_FREEZABLE |
+                                             WQ_UNBOUND | WQ_SYSFS, 0);
+       if (!bdi_wq)
+               return -ENOMEM;
+
        err = bdi_init(&default_backing_dev_info);
        if (!err)
                bdi_register(&default_backing_dev_info, NULL, "default");
@@ -271,26 +277,6 @@ int bdi_has_dirty_io(struct backing_dev_info *bdi)
        return wb_has_dirty_io(&bdi->wb);
 }
 
-static void wakeup_timer_fn(unsigned long data)
-{
-       struct backing_dev_info *bdi = (struct backing_dev_info *)data;
-
-       spin_lock_bh(&bdi->wb_lock);
-       if (bdi->wb.task) {
-               trace_writeback_wake_thread(bdi);
-               wake_up_process(bdi->wb.task);
-       } else if (bdi->dev) {
-               /*
-                * When bdi tasks are inactive for long time, they are killed.
-                * In this case we have to wake-up the forker thread which
-                * should create and run the bdi thread.
-                */
-               trace_writeback_wake_forker_thread(bdi);
-               wake_up_process(default_backing_dev_info.wb.task);
-       }
-       spin_unlock_bh(&bdi->wb_lock);
-}
-
 /*
  * This function is used when the first inode for this bdi is marked dirty. It
  * wakes-up the corresponding bdi thread which should then take care of the
@@ -307,176 +293,7 @@ void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi)
        unsigned long timeout;
 
        timeout = msecs_to_jiffies(dirty_writeback_interval * 10);
-       mod_timer(&bdi->wb.wakeup_timer, jiffies + timeout);
-}
-
-/*
- * Calculate the longest interval (jiffies) bdi threads are allowed to be
- * inactive.
- */
-static unsigned long bdi_longest_inactive(void)
-{
-       unsigned long interval;
-
-       interval = msecs_to_jiffies(dirty_writeback_interval * 10);
-       return max(5UL * 60 * HZ, interval);
-}
-
-/*
- * Clear pending bit and wakeup anybody waiting for flusher thread creation or
- * shutdown
- */
-static void bdi_clear_pending(struct backing_dev_info *bdi)
-{
-       clear_bit(BDI_pending, &bdi->state);
-       smp_mb__after_clear_bit();
-       wake_up_bit(&bdi->state, BDI_pending);
-}
-
-static int bdi_forker_thread(void *ptr)
-{
-       struct bdi_writeback *me = ptr;
-
-       current->flags |= PF_SWAPWRITE;
-       set_freezable();
-
-       /*
-        * Our parent may run at a different priority, just set us to normal
-        */
-       set_user_nice(current, 0);
-
-       for (;;) {
-               struct task_struct *task = NULL;
-               struct backing_dev_info *bdi;
-               enum {
-                       NO_ACTION,   /* Nothing to do */
-                       FORK_THREAD, /* Fork bdi thread */
-                       KILL_THREAD, /* Kill inactive bdi thread */
-               } action = NO_ACTION;
-
-               /*
-                * Temporary measure, we want to make sure we don't see
-                * dirty data on the default backing_dev_info
-                */
-               if (wb_has_dirty_io(me) || !list_empty(&me->bdi->work_list)) {
-                       del_timer(&me->wakeup_timer);
-                       wb_do_writeback(me, 0);
-               }
-
-               spin_lock_bh(&bdi_lock);
-               /*
-                * In the following loop we are going to check whether we have
-                * some work to do without any synchronization with tasks
-                * waking us up to do work for them. Set the task state here
-                * so that we don't miss wakeups after verifying conditions.
-                */
-               set_current_state(TASK_INTERRUPTIBLE);
-
-               list_for_each_entry(bdi, &bdi_list, bdi_list) {
-                       bool have_dirty_io;
-
-                       if (!bdi_cap_writeback_dirty(bdi) ||
-                            bdi_cap_flush_forker(bdi))
-                               continue;
-
-                       WARN(!test_bit(BDI_registered, &bdi->state),
-                            "bdi %p/%s is not registered!\n", bdi, bdi->name);
-
-                       have_dirty_io = !list_empty(&bdi->work_list) ||
-                                       wb_has_dirty_io(&bdi->wb);
-
-                       /*
-                        * If the bdi has work to do, but the thread does not
-                        * exist - create it.
-                        */
-                       if (!bdi->wb.task && have_dirty_io) {
-                               /*
-                                * Set the pending bit - if someone will try to
-                                * unregister this bdi - it'll wait on this bit.
-                                */
-                               set_bit(BDI_pending, &bdi->state);
-                               action = FORK_THREAD;
-                               break;
-                       }
-
-                       spin_lock(&bdi->wb_lock);
-
-                       /*
-                        * If there is no work to do and the bdi thread was
-                        * inactive long enough - kill it. The wb_lock is taken
-                        * to make sure no-one adds more work to this bdi and
-                        * wakes the bdi thread up.
-                        */
-                       if (bdi->wb.task && !have_dirty_io &&
-                           time_after(jiffies, bdi->wb.last_active +
-                                               bdi_longest_inactive())) {
-                               task = bdi->wb.task;
-                               bdi->wb.task = NULL;
-                               spin_unlock(&bdi->wb_lock);
-                               set_bit(BDI_pending, &bdi->state);
-                               action = KILL_THREAD;
-                               break;
-                       }
-                       spin_unlock(&bdi->wb_lock);
-               }
-               spin_unlock_bh(&bdi_lock);
-
-               /* Keep working if default bdi still has things to do */
-               if (!list_empty(&me->bdi->work_list))
-                       __set_current_state(TASK_RUNNING);
-
-               switch (action) {
-               case FORK_THREAD:
-                       __set_current_state(TASK_RUNNING);
-                       task = kthread_create(bdi_writeback_thread, &bdi->wb,
-                                             "flush-%s", dev_name(bdi->dev));
-                       if (IS_ERR(task)) {
-                               /*
-                                * If thread creation fails, force writeout of
-                                * the bdi from the thread. Hopefully 1024 is
-                                * large enough for efficient IO.
-                                */
-                               writeback_inodes_wb(&bdi->wb, 1024,
-                                                   WB_REASON_FORKER_THREAD);
-                       } else {
-                               /*
-                                * The spinlock makes sure we do not lose
-                                * wake-ups when racing with 'bdi_queue_work()'.
-                                * And as soon as the bdi thread is visible, we
-                                * can start it.
-                                */
-                               spin_lock_bh(&bdi->wb_lock);
-                               bdi->wb.task = task;
-                               spin_unlock_bh(&bdi->wb_lock);
-                               wake_up_process(task);
-                       }
-                       bdi_clear_pending(bdi);
-                       break;
-
-               case KILL_THREAD:
-                       __set_current_state(TASK_RUNNING);
-                       kthread_stop(task);
-                       bdi_clear_pending(bdi);
-                       break;
-
-               case NO_ACTION:
-                       if (!wb_has_dirty_io(me) || !dirty_writeback_interval)
-                               /*
-                                * There are no dirty data. The only thing we
-                                * should now care about is checking for
-                                * inactive bdi threads and killing them. Thus,
-                                * let's sleep for longer time, save energy and
-                                * be friendly for battery-driven devices.
-                                */
-                               schedule_timeout(bdi_longest_inactive());
-                       else
-                               schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10));
-                       try_to_freeze();
-                       break;
-               }
-       }
-
-       return 0;
+       mod_delayed_work(bdi_wq, &bdi->wb.dwork, timeout);
 }
 
 /*
@@ -489,6 +306,9 @@ static void bdi_remove_from_list(struct backing_dev_info *bdi)
        spin_unlock_bh(&bdi_lock);
 
        synchronize_rcu_expedited();
+
+       /* bdi_list is now unused, clear it to mark @bdi dying */
+       INIT_LIST_HEAD(&bdi->bdi_list);
 }
 
 int bdi_register(struct backing_dev_info *bdi, struct device *parent,
@@ -508,20 +328,6 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
 
        bdi->dev = dev;
 
-       /*
-        * Just start the forker thread for our default backing_dev_info,
-        * and add other bdi's to the list. They will get a thread created
-        * on-demand when they need it.
-        */
-       if (bdi_cap_flush_forker(bdi)) {
-               struct bdi_writeback *wb = &bdi->wb;
-
-               wb->task = kthread_run(bdi_forker_thread, wb, "bdi-%s",
-                                               dev_name(dev));
-               if (IS_ERR(wb->task))
-                       return PTR_ERR(wb->task);
-       }
-
        bdi_debug_register(bdi, dev_name(dev));
        set_bit(BDI_registered, &bdi->state);
 
@@ -545,8 +351,6 @@ EXPORT_SYMBOL(bdi_register_dev);
  */
 static void bdi_wb_shutdown(struct backing_dev_info *bdi)
 {
-       struct task_struct *task;
-
        if (!bdi_cap_writeback_dirty(bdi))
                return;
 
@@ -556,22 +360,20 @@ static void bdi_wb_shutdown(struct backing_dev_info *bdi)
        bdi_remove_from_list(bdi);
 
        /*
-        * If setup is pending, wait for that to complete first
+        * Drain work list and shutdown the delayed_work.  At this point,
+        * @bdi->bdi_list is empty telling bdi_writeback_workfn() that @bdi
+        * is dying and its work_list needs to be drained no matter what.
         */
-       wait_on_bit(&bdi->state, BDI_pending, bdi_sched_wait,
-                       TASK_UNINTERRUPTIBLE);
+       mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
+       flush_delayed_work(&bdi->wb.dwork);
+       WARN_ON(!list_empty(&bdi->work_list));
 
        /*
-        * Finally, kill the kernel thread. We don't need to be RCU
-        * safe anymore, since the bdi is gone from visibility.
+        * This shouldn't be necessary unless @bdi for some reason has
+        * unflushed dirty IO after work_list is drained.  Do it anyway
+        * just in case.
         */
-       spin_lock_bh(&bdi->wb_lock);
-       task = bdi->wb.task;
-       bdi->wb.task = NULL;
-       spin_unlock_bh(&bdi->wb_lock);
-
-       if (task)
-               kthread_stop(task);
+       cancel_delayed_work_sync(&bdi->wb.dwork);
 }
 
 /*
@@ -597,10 +399,8 @@ void bdi_unregister(struct backing_dev_info *bdi)
                bdi_set_min_ratio(bdi, 0);
                trace_writeback_bdi_unregister(bdi);
                bdi_prune_sb(bdi);
-               del_timer_sync(&bdi->wb.wakeup_timer);
 
-               if (!bdi_cap_flush_forker(bdi))
-                       bdi_wb_shutdown(bdi);
+               bdi_wb_shutdown(bdi);
                bdi_debug_unregister(bdi);
 
                spin_lock_bh(&bdi->wb_lock);
@@ -622,7 +422,7 @@ static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
        INIT_LIST_HEAD(&wb->b_io);
        INIT_LIST_HEAD(&wb->b_more_io);
        spin_lock_init(&wb->list_lock);
-       setup_timer(&wb->wakeup_timer, wakeup_timer_fn, (unsigned long)bdi);
+       INIT_DELAYED_WORK(&wb->dwork, bdi_writeback_workfn);
 }
 
 /*
@@ -695,12 +495,11 @@ void bdi_destroy(struct backing_dev_info *bdi)
        bdi_unregister(bdi);
 
        /*
-        * If bdi_unregister() had already been called earlier, the
-        * wakeup_timer could still be armed because bdi_prune_sb()
-        * can race with the bdi_wakeup_thread_delayed() calls from
-        * __mark_inode_dirty().
+        * If bdi_unregister() had already been called earlier, the dwork
+        * could still be pending because bdi_prune_sb() can race with the
+        * bdi_wakeup_thread_delayed() calls from __mark_inode_dirty().
         */
-       del_timer_sync(&bdi->wb.wakeup_timer);
+       cancel_delayed_work_sync(&bdi->wb.dwork);
 
        for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
                percpu_counter_destroy(&bdi->bdi_stat[i]);
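These hunks replace the per-bdi flusher/forker threads with delayed work on a shared bdi_wq workqueue; mod_delayed_work() queues the work or simply moves an already pending instance to the new expiry, and cancel_delayed_work_sync()/flush_delayed_work() give the shutdown guarantees the old kthread_stop() path provided. A minimal, hypothetical module sketch of that pattern (not the bdi code itself, just the same workqueue calls):

#include <linux/module.h>
#include <linux/workqueue.h>

static struct workqueue_struct *demo_wq;
static struct delayed_work demo_dwork;

static void demo_workfn(struct work_struct *work)
{
	pr_info("demo: periodic work ran\n");
	/* re-arm ourselves, like periodic writeback does */
	queue_delayed_work(demo_wq, &demo_dwork, 5 * HZ);
}

static int __init demo_init(void)
{
	demo_wq = alloc_workqueue("demo", WQ_MEM_RECLAIM | WQ_UNBOUND, 0);
	if (!demo_wq)
		return -ENOMEM;

	INIT_DELAYED_WORK(&demo_dwork, demo_workfn);
	/* queue the work, or move a pending instance to the new expiry */
	mod_delayed_work(demo_wq, &demo_dwork, HZ);
	return 0;
}

static void __exit demo_exit(void)
{
	cancel_delayed_work_sync(&demo_dwork);
	destroy_workqueue(demo_wq);
}
module_init(demo_init);
module_exit(demo_exit);

MODULE_LICENSE("GPL");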
index a5c2ec3589cb94934e8654821b0876d28e1f5510..c9f0a4339a7dafc2ba7295e49ad8fcdda8fa13de 100644 (file)
@@ -101,7 +101,7 @@ static void copy_to_high_bio_irq(struct bio *to, struct bio *from)
        struct bio_vec *tovec, *fromvec;
        int i;
 
-       __bio_for_each_segment(tovec, to, i, 0) {
+       bio_for_each_segment(tovec, to, i) {
                fromvec = from->bi_io_vec + i;
 
                /*
@@ -134,7 +134,7 @@ static void bounce_end_io(struct bio *bio, mempool_t *pool, int err)
        /*
         * free up bounce indirect pages used
         */
-       __bio_for_each_segment(bvec, bio, i, 0) {
+       bio_for_each_segment_all(bvec, bio, i) {
                org_vec = bio_orig->bi_io_vec + i;
                if (bvec->bv_page == org_vec->bv_page)
                        continue;
@@ -199,78 +199,43 @@ static int must_snapshot_stable_pages(struct request_queue *q, struct bio *bio)
 static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
                               mempool_t *pool, int force)
 {
-       struct page *page;
-       struct bio *bio = NULL;
-       int i, rw = bio_data_dir(*bio_orig);
+       struct bio *bio;
+       int rw = bio_data_dir(*bio_orig);
        struct bio_vec *to, *from;
+       unsigned i;
 
-       bio_for_each_segment(from, *bio_orig, i) {
-               page = from->bv_page;
+       bio_for_each_segment(from, *bio_orig, i)
+               if (page_to_pfn(from->bv_page) > queue_bounce_pfn(q))
+                       goto bounce;
 
-               /*
-                * is destination page below bounce pfn?
-                */
-               if (page_to_pfn(page) <= queue_bounce_pfn(q) && !force)
-                       continue;
-
-               /*
-                * irk, bounce it
-                */
-               if (!bio) {
-                       unsigned int cnt = (*bio_orig)->bi_vcnt;
+       return;
+bounce:
+       bio = bio_clone_bioset(*bio_orig, GFP_NOIO, fs_bio_set);
 
-                       bio = bio_alloc(GFP_NOIO, cnt);
-                       memset(bio->bi_io_vec, 0, cnt * sizeof(struct bio_vec));
-               }
-                       
+       bio_for_each_segment_all(to, bio, i) {
+               struct page *page = to->bv_page;
 
-               to = bio->bi_io_vec + i;
+               if (page_to_pfn(page) <= queue_bounce_pfn(q) && !force)
+                       continue;
 
-               to->bv_page = mempool_alloc(pool, q->bounce_gfp);
-               to->bv_len = from->bv_len;
-               to->bv_offset = from->bv_offset;
                inc_zone_page_state(to->bv_page, NR_BOUNCE);
+               to->bv_page = mempool_alloc(pool, q->bounce_gfp);
 
                if (rw == WRITE) {
                        char *vto, *vfrom;
 
-                       flush_dcache_page(from->bv_page);
+                       flush_dcache_page(page);
+
                        vto = page_address(to->bv_page) + to->bv_offset;
-                       vfrom = kmap(from->bv_page) + from->bv_offset;
+                       vfrom = kmap_atomic(page) + to->bv_offset;
                        memcpy(vto, vfrom, to->bv_len);
-                       kunmap(from->bv_page);
+                       kunmap_atomic(vfrom);
                }
        }
 
-       /*
-        * no pages bounced
-        */
-       if (!bio)
-               return;
-
        trace_block_bio_bounce(q, *bio_orig);
 
-       /*
-        * at least one page was bounced, fill in possible non-highmem
-        * pages
-        */
-       __bio_for_each_segment(from, *bio_orig, i, 0) {
-               to = bio_iovec_idx(bio, i);
-               if (!to->bv_page) {
-                       to->bv_page = from->bv_page;
-                       to->bv_len = from->bv_len;
-                       to->bv_offset = from->bv_offset;
-               }
-       }
-
-       bio->bi_bdev = (*bio_orig)->bi_bdev;
        bio->bi_flags |= (1 << BIO_BOUNCED);
-       bio->bi_sector = (*bio_orig)->bi_sector;
-       bio->bi_rw = (*bio_orig)->bi_rw;
-
-       bio->bi_vcnt = (*bio_orig)->bi_vcnt;
-       bio->bi_idx = (*bio_orig)->bi_idx;
-       bio->bi_size = (*bio_orig)->bi_size;
 
        if (pool == page_pool) {
                bio->bi_end_io = bounce_end_io_write;
index 0f1d92163f30321caa6aac197d164fc6d46362be..cb1c9dedf9b65c08a4a6d9d6f81ee01f2cb44c36 100644 (file)
@@ -92,16 +92,18 @@ enum mem_cgroup_stat_index {
        /*
         * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
         */
-       MEM_CGROUP_STAT_CACHE,     /* # of pages charged as cache */
-       MEM_CGROUP_STAT_RSS,       /* # of pages charged as anon rss */
-       MEM_CGROUP_STAT_FILE_MAPPED,  /* # of pages charged as file rss */
-       MEM_CGROUP_STAT_SWAP, /* # of pages, swapped out */
+       MEM_CGROUP_STAT_CACHE,          /* # of pages charged as cache */
+       MEM_CGROUP_STAT_RSS,            /* # of pages charged as anon rss */
+       MEM_CGROUP_STAT_RSS_HUGE,       /* # of pages charged as anon huge */
+       MEM_CGROUP_STAT_FILE_MAPPED,    /* # of pages charged as file rss */
+       MEM_CGROUP_STAT_SWAP,           /* # of pages, swapped out */
        MEM_CGROUP_STAT_NSTATS,
 };
 
 static const char * const mem_cgroup_stat_names[] = {
        "cache",
        "rss",
+       "rss_huge",
        "mapped_file",
        "swap",
 };
@@ -917,6 +919,7 @@ static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
 }
 
 static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
+                                        struct page *page,
                                         bool anon, int nr_pages)
 {
        preempt_disable();
@@ -932,6 +935,10 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
                __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE],
                                nr_pages);
 
+       if (PageTransHuge(page))
+               __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
+                               nr_pages);
+
        /* pagein of a big page is an event. So, ignore page size */
        if (nr_pages > 0)
                __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGIN]);
@@ -2914,7 +2921,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
        else
                anon = false;
 
-       mem_cgroup_charge_statistics(memcg, anon, nr_pages);
+       mem_cgroup_charge_statistics(memcg, page, anon, nr_pages);
        unlock_page_cgroup(pc);
 
        /*
@@ -3708,16 +3715,21 @@ void mem_cgroup_split_huge_fixup(struct page *head)
 {
        struct page_cgroup *head_pc = lookup_page_cgroup(head);
        struct page_cgroup *pc;
+       struct mem_cgroup *memcg;
        int i;
 
        if (mem_cgroup_disabled())
                return;
+
+       memcg = head_pc->mem_cgroup;
        for (i = 1; i < HPAGE_PMD_NR; i++) {
                pc = head_pc + i;
-               pc->mem_cgroup = head_pc->mem_cgroup;
+               pc->mem_cgroup = memcg;
                smp_wmb();/* see __commit_charge() */
                pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT;
        }
+       __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
+                      HPAGE_PMD_NR);
 }
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
@@ -3773,11 +3785,11 @@ static int mem_cgroup_move_account(struct page *page,
                __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
                preempt_enable();
        }
-       mem_cgroup_charge_statistics(from, anon, -nr_pages);
+       mem_cgroup_charge_statistics(from, page, anon, -nr_pages);
 
        /* caller should have done css_get */
        pc->mem_cgroup = to;
-       mem_cgroup_charge_statistics(to, anon, nr_pages);
+       mem_cgroup_charge_statistics(to, page, anon, nr_pages);
        move_unlock_mem_cgroup(from, &flags);
        ret = 0;
 unlock:
@@ -4152,7 +4164,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype,
                break;
        }
 
-       mem_cgroup_charge_statistics(memcg, anon, -nr_pages);
+       mem_cgroup_charge_statistics(memcg, page, anon, -nr_pages);
 
        ClearPageCgroupUsed(pc);
        /*
@@ -4502,7 +4514,7 @@ void mem_cgroup_replace_page_cache(struct page *oldpage,
        lock_page_cgroup(pc);
        if (PageCgroupUsed(pc)) {
                memcg = pc->mem_cgroup;
-               mem_cgroup_charge_statistics(memcg, false, -1);
+               mem_cgroup_charge_statistics(memcg, oldpage, false, -1);
                ClearPageCgroupUsed(pc);
        }
        unlock_page_cgroup(pc);
@@ -5030,6 +5042,10 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
                        return res_counter_read_u64(&memcg->memsw, RES_USAGE);
        }
 
+       /*
+        * Transparent hugepages are still accounted for in MEM_CGROUP_STAT_RSS
+        * as well as in MEM_CGROUP_STAT_RSS_HUGE.
+        */
        val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE);
        val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS);
 
index da3e9c04bf370fe5d591d72aa6222a8c47ec402c..1ae21d645c681f517abfe0feef252d5205a8acd1 100644 (file)
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1363,15 +1363,20 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
                file = fget(fd);
                if (!file)
                        goto out;
+               if (is_file_hugepages(file))
+                       len = ALIGN(len, huge_page_size(hstate_file(file)));
        } else if (flags & MAP_HUGETLB) {
                struct user_struct *user = NULL;
+
+               len = ALIGN(len, huge_page_size(hstate_sizelog(
+                       (flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK)));
                /*
                 * VM_NORESERVE is used because the reservations will be
                 * taken when vm_ops->mmap() is called
                 * A dummy user value is used because we are not locking
                 * memory so no accounting is necessary
                 */
-               file = hugetlb_file_setup(HUGETLB_ANON_FILE, addr, len,
+               file = hugetlb_file_setup(HUGETLB_ANON_FILE, len,
                                VM_NORESERVE,
                                &user, HUGETLB_ANONHUGE_INODE,
                                (flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
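With this change an unaligned MAP_HUGETLB length is rounded up to the huge page size inside the kernel instead of tripping up the hugetlbfs setup later. A hedged userspace sketch (assumes 2 MiB huge pages are configured and some are reserved; error handling kept minimal):

#include <stdio.h>
#include <stddef.h>
#include <sys/mman.h>

#ifndef MAP_HUGETLB
#define MAP_HUGETLB 0x40000
#endif

int main(void)
{
	/* deliberately not a multiple of the 2 MiB huge page size */
	size_t len = (2UL << 20) + 4096;
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap(MAP_HUGETLB)");   /* needs huge pages reserved */
		return 1;
	}
	puts("mapped unaligned length; kernel rounded it up");
	return 0;                              /* exit unmaps everything */
}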
index 3dcfaf4ed355a3deb6dee71dab84bdaa98a7e9d4..8a8cd0265e523b54909e456881a00d497fa456d3 100644 (file)
@@ -14,9 +14,6 @@
  * use_mm
  *     Makes the calling kernel thread take on the specified
  *     mm context.
- *     Called by the retry thread execute retries within the
- *     iocb issuer's mm context, so that copy_from/to_user
- *     operations work seamlessly for aio.
  *     (Note: this routine is intended to be called only
  *     from a kernel thread context)
  */
index bb5d752746867b717e7281ee1b03c44d14acd979..a8a3ef45fed753b68ac1cc4a94c9260979a37879 100644 (file)
@@ -20,6 +20,7 @@
 #include <linux/buffer_head.h>
 #include <linux/writeback.h>
 #include <linux/frontswap.h>
+#include <linux/aio.h>
 #include <asm/pgtable.h>
 
 static struct bio *get_swap_bio(gfp_t gfp_flags,
@@ -35,7 +36,6 @@ static struct bio *get_swap_bio(gfp_t gfp_flags,
                bio->bi_io_vec[0].bv_len = PAGE_SIZE;
                bio->bi_io_vec[0].bv_offset = 0;
                bio->bi_vcnt = 1;
-               bio->bi_idx = 0;
                bio->bi_size = PAGE_SIZE;
                bio->bi_end_io = end_io;
        }
index 39b2a0b86fe83854d279397c13c84b32b20fe8f0..5e6a8422658b832921196ca4908e7cc6148eb84a 100644 (file)
@@ -31,6 +31,7 @@
 #include <linux/mm.h>
 #include <linux/export.h>
 #include <linux/swap.h>
+#include <linux/aio.h>
 
 static struct vfsmount *shm_mnt;
 
index d2517b05d5bc488ba986c1f3c2993051dc9bd736..ff3218a0f5e14029aab5c23dd323f447d2a86173 100644 (file)
@@ -446,18 +446,18 @@ void __init create_kmalloc_caches(unsigned long flags)
                if (!kmalloc_caches[i]) {
                        kmalloc_caches[i] = create_kmalloc_cache(NULL,
                                                        1 << i, flags);
+               }
 
-                       /*
-                        * Caches that are not of the two-to-the-power-of size.
-                        * These have to be created immediately after the
-                        * earlier power of two caches
-                        */
-                       if (KMALLOC_MIN_SIZE <= 32 && !kmalloc_caches[1] && i == 6)
-                               kmalloc_caches[1] = create_kmalloc_cache(NULL, 96, flags);
+               /*
+                * Caches that are not of the two-to-the-power-of size.
+                * These have to be created immediately after the
+                * earlier power of two caches
+                */
+               if (KMALLOC_MIN_SIZE <= 32 && !kmalloc_caches[1] && i == 6)
+                       kmalloc_caches[1] = create_kmalloc_cache(NULL, 96, flags);
 
-                       if (KMALLOC_MIN_SIZE <= 64 && !kmalloc_caches[2] && i == 7)
-                               kmalloc_caches[2] = create_kmalloc_cache(NULL, 192, flags);
-               }
+               if (KMALLOC_MIN_SIZE <= 64 && !kmalloc_caches[2] && i == 7)
+                       kmalloc_caches[2] = create_kmalloc_cache(NULL, 192, flags);
        }
 
        /* Kmalloc array is now usable */
index acd40bfffa8287cb0942297feb9dd0ceceb8e368..dfd7d71d68418023b592c09a7d18c7a3ac4c5843 100644 (file)
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -30,6 +30,7 @@
 #include <linux/backing-dev.h>
 #include <linux/memcontrol.h>
 #include <linux/gfp.h>
+#include <linux/uio.h>
 
 #include "internal.h"
 
index b12fd8612604f26bfca3006fd07bcbad27d73040..d365724feb05206fc0b37c852f5859499f1cb6b3 100644 (file)
@@ -1522,6 +1522,8 @@ static void __vunmap(const void *addr, int deallocate_pages)
  *     Must not be called in NMI context (strictly speaking, only if we don't
  *     have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling
  *     conventions for vfree() arch-dependent would be a really bad idea)
+ *
+ *     NOTE: assumes that the object at *addr has a size >= sizeof(llist_node)
  *     
  */
 void vfree(const void *addr)
index 40b1fadaf637cafb12a8aece038882f8ba045274..fc1e289397f5895f3d2191ee78c9b450cc33e5cf 100644 (file)
@@ -2213,6 +2213,17 @@ __be16 skb_network_protocol(struct sk_buff *skb)
        __be16 type = skb->protocol;
        int vlan_depth = ETH_HLEN;
 
+       /* Tunnel gso handlers can set protocol to ethernet. */
+       if (type == htons(ETH_P_TEB)) {
+               struct ethhdr *eth;
+
+               if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
+                       return 0;
+
+               eth = (struct ethhdr *)skb_mac_header(skb);
+               type = eth->h_proto;
+       }
+
        while (type == htons(ETH_P_8021Q) || type == htons(ETH_P_8021AD)) {
                struct vlan_hdr *vh;
 
index cc22363965d2a9a66d08e9368cc4e62e2ccea08a..b2e805af9b87a03675d7bac1a7e210124757e566 100644 (file)
@@ -150,13 +150,7 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
                csum = false;
 
        /* setup inner skb. */
-       if (greh->protocol == htons(ETH_P_TEB)) {
-               struct ethhdr *eth = (struct ethhdr *)skb_inner_mac_header(skb);
-               skb->protocol = eth->h_proto;
-       } else {
-               skb->protocol = greh->protocol;
-       }
-
+       skb->protocol = greh->protocol;
        skb->encapsulation = 0;
 
        if (unlikely(!pskb_may_pull(skb, ghl)))
index 0ae038a4c7a80929e2083911d3fc17aa5432be3f..0bf5d399a03c1c0eaedeedd4a6a9db4ee2af68e6 100644 (file)
@@ -2311,7 +2311,6 @@ static struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb,
        struct sk_buff *segs = ERR_PTR(-EINVAL);
        int mac_len = skb->mac_len;
        int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb);
-       struct ethhdr *inner_eth = (struct ethhdr *)skb_inner_mac_header(skb);
        __be16 protocol = skb->protocol;
        netdev_features_t enc_features;
        int outer_hlen;
@@ -2324,8 +2323,7 @@ static struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb,
        skb_reset_mac_header(skb);
        skb_set_network_header(skb, skb_inner_network_offset(skb));
        skb->mac_len = skb_inner_network_offset(skb);
-       inner_eth = (struct ethhdr *)skb_mac_header(skb);
-       skb->protocol = inner_eth->h_proto;
+       skb->protocol = htons(ETH_P_TEB);
 
        /* segment inner packet. */
        enc_features = skb->dev->hw_enc_features & netif_skb_features(skb);
index a764e227fddeb06955226e7a7d622282e168b481..7da6b457f66abfab016fd8b21aeedcb14d5e7ff0 100644 (file)
@@ -867,8 +867,7 @@ gss_create(struct rpc_clnt *clnt, rpc_authflavor_t flavor)
        err = -EINVAL;
        gss_auth->mech = gss_mech_get_by_pseudoflavor(flavor);
        if (!gss_auth->mech) {
-               printk(KERN_WARNING "%s: Pseudoflavor %d not found!\n",
-                               __func__, flavor);
+               dprintk("RPC:       Pseudoflavor %d not found!\n", flavor);
                goto err_free;
        }
        gss_auth->service = gss_pseudoflavor_to_service(gss_auth->mech, flavor);
index 3f7930f938cc295c79cdec097c1db4a9109a6086..5a750b9c36404b34a3b41bd0e2d38628f881a5f2 100644 (file)
@@ -360,7 +360,7 @@ static struct rpc_clnt * rpc_new_client(const struct rpc_create_args *args, stru
 
        auth = rpcauth_create(args->authflavor, clnt);
        if (IS_ERR(auth)) {
-               printk(KERN_INFO "RPC: Couldn't create auth handle (flavor %u)\n",
+               dprintk("RPC:       Couldn't create auth handle (flavor %u)\n",
                                args->authflavor);
                err = PTR_ERR(auth);
                goto out_no_auth;
index 8bbefc3b55d42961f94c1c8727b4db8f9026f3f9..d4f1468b9b50f46cd7d739544902a77a3ff40384 100644 (file)
@@ -16,6 +16,8 @@
 #include <linux/key-type.h>
 #include <linux/task_work.h>
 
+struct iovec;
+
 #ifdef __KDEBUG
 #define kenter(FMT, ...) \
        printk(KERN_DEBUG "==> %s("FMT")\n", __func__, ##__VA_ARGS__)
index 4b5c948eb41426c76ef1810239cb0a98f9c4f918..33cfd27b4de29650ae6ad0e1eb45646714a00f27 100644 (file)
@@ -22,6 +22,7 @@
 #include <linux/err.h>
 #include <linux/vmalloc.h>
 #include <linux/security.h>
+#include <linux/uio.h>
 #include <asm/uaccess.h>
 #include "internal.h"
 
index 23e3c46cd0a4eda841d66d0fd637e55e88397564..ccfa383f1fda33d5cd8e4d6ed16e42ed875cc5a1 100644 (file)
@@ -25,7 +25,7 @@
 #include <linux/slab.h>
 #include <linux/time.h>
 #include <linux/pm_qos.h>
-#include <linux/uio.h>
+#include <linux/aio.h>
 #include <linux/dma-mapping.h>
 #include <sound/core.h>
 #include <sound/control.h>